Exemple #1
0
def SGDData():
    root    = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/' 
    files = ['chromosomal_feature/SGD_features.tab',
             'chromosomal_feature/dbxref.tab',
             'chromosomal_feature/chromosome_length.tab',
             'sequence/GenBank/yeast_nrpep.fasta.gz',
             'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
             ('http://yeastgfp.ucsf.edu/allOrfData.txt','Huh_Nature_2003.tab')
             ]

    chrs = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()

    files.extend( ['sequence/NCBI_genome_source/chr%s.fsa'%x for x in chrs] )

    downloadfiles(root,urlroot,files)

    from TAMO.seq import Fasta
    
    print "Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)"
    D = {}
    for chr in chrs:
        _d = Fasta.load('%s/chr%s.fsa'%(TAMO.paths.SGDdir,chr))
        id, seq = _d.items()[0]
        if chr[0] == '0': chr = chr[1]
        D['chr%s  %s'%(chr,id)] = seq
    Fasta.write(D, TAMO.paths.SGDdir + 'NCBI_yeast_genome.fsa')
Exemple #2
0
def SGDData():
    root    = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/' 
    files = ['chromosomal_feature/SGD_features.tab',
             'chromosomal_feature/dbxref.tab',
             'chromosomal_feature/chromosome_length.tab',
             'sequence/GenBank/yeast_nrpep.fasta.gz',
             'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
             ('http://yeastgfp.ucsf.edu/allOrfData.txt','Huh_Nature_2003.tab')
             ]

    chrs = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()

    files.extend( ['sequence/NCBI_genome_source/chr%s.fsa'%x for x in chrs] )

    downloadfiles(root,urlroot,files)

    from TAMO.seq import Fasta
    
    print "Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)"
    D = {}
    for chr in chrs:
        _d = Fasta.load('%s/chr%s.fsa'%(TAMO.paths.SGDdir,chr))
        id, seq = _d.items()[0]
        if chr[0] == '0': chr = chr[1]
        D['chr%s  %s'%(chr,id)] = seq
    Fasta.write(D, TAMO.paths.SGDdir + 'NCBI_yeast_genome.fsa')
Exemple #3
0
def genomebg(infile,outfile):
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD   = Fasta.load(infile)
    tmpfsa = tempfile.mktemp()
    Fasta.write(fsaD,tmpfsa,linelen=1000000000)
    CMD = '%s -i %s -o %s'%(EXE,tmpfsa,outfile)
    FID = os.popen('( %s ;) 2>&1'%CMD,'r')
    for line in FID.readlines(): print line
    if FID.close(): print "Exited"
    os.unlink(tmpfsa)
Exemple #4
0
def genomebg(infile, outfile):
    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    tmpfsa = tempfile.mktemp()
    Fasta.write(fsaD, tmpfsa, linelen=1000000000)
    CMD = '%s -i %s -o %s' % (EXE, tmpfsa, outfile)
    FID = os.popen('( %s ;) 2>&1' % CMD, 'r')
    for line in FID.readlines():
        print line
    if FID.close(): print "Exited"
    os.unlink(tmpfsa)
Exemple #5
0
def memefiles2tamo(files, tamoname):
    """Collect motifs from AlignACE/MEME output files, score them, and save them.

    files    -- iterable of result file names; names ending in '.ace' are
                read with AlignAce.AlignAce, names containing '.meme' with
                Meme.Meme
    tamoname -- destination passed to MotifTools.save_motifs

    Reads the module globals probefile and fsafile and may rebind PROBESET.

    NOTE(review): mdobject and filename are used after the loop, so they
    refer to the LAST file processed; a file matching neither pattern
    leaves mdobject unchanged (or unbound on the first iteration).
    """
    global probefile, PROBESET, fsafile
    
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ",filename
        if   re.search('\.ace$',filename):
            mdobject = AlignAce.AlignAce(filename)
            # fall back to the sibling .fsa file when none was recorded
            if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa')
        elif re.search('\.meme.*$',filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                # drop a one-character infix before '.meme', then swap the extension to '.fsa'
                mdobject.fastafile=re.sub('\..\.meme','.meme',filename).replace('.meme','.fsa')
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    # an explicitly configured fsafile wins over the one recorded in the results
    if fsafile: fsaname = fsafile
    else:       fsaname = Fasta.find(mdobject.fastafile)
    fsaD    = Fasta.load(fsaname)
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # make sure every input sequence is known to the probe set
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        # a value of 1 (pvalue/church) or None is treated as "not yet computed"
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP   == None: motif.MNCP   = PROBESET.MNCP(motif,probes,'v')
        if motif.frac   == None: motif.frac   = PROBESET.frac(motif,probes,'v',0.7)
        # filename here is the last file from the loop above
        if re.search('\.meme$',filename):
            # MAP is set to -log10 of the MEME E-value
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        # 'if 0' disables this branch; it never runs
        if 0 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass

    # NOTE(review): this sorts mdobject.motifs (last file only), while the
    # list that gets saved below is the separate 'motifs' list -- confirm
    # whether 'motifs' was meant to be sorted.
    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs,tamoname)
Exemple #6
0
def main():
    seqsD = Fasta.load(sys.argv[1])
    seqs  = seqsD.values()
    for w in range(1,7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w,seqs,'with counts','purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1 #Pseudo count
            total = total + 1
        for nmer,count in nmersT[:]:
            try: 
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc]   = nmersD[rc]   + count
                total = total + 2*count
            except KeyError:
                pass
        _t = nmersD.keys()
        _t.sort()
        print "# freq in %s (total %d with pseudocounts)"%(sys.argv[1],total)
        for nmer in _t:
            print "%-7s %20.17f"%(nmer,float(nmersD[nmer]) / total)
        sys.stdout.flush()
Exemple #7
0
def loadMiRNAs(miRNA_Path):
    """
    Load the mature-miRNA fasta file at miRNA_Path.
    Returns whatever Fasta.load produces for that file (a dict of records).
    """
    miRNAs = Fasta.load(miRNA_Path)
    return miRNAs
Exemple #8
0
def info2seeds(N,infofile,probefile,species='YEAST'):
    G    = ProbeSet(species)
    IDs  = G.ids_from_file(probefile)
    Q    = EM.theMarkovBackground.zeroth()
 
    seqs = Fasta.seqs(infofile)
    
    if not N:
        nmers = seqs
    else:
        nmers= MotifTools.top_nmers(N,seqs)
        if len(nmers) > 1000: nmers = nmers[0:1000]
        
    print "Scoring enrichment of %d nmers from %s"%len(nmers,infofile)
    sys.stdout.flush()
    
    nmers_scoresT = []
    for nmer in nmers:
        if nmer.isalpha():
            p = G.p_value(nmer,IDs,'') #'verbose'
            nmers_scoresT.append((nmer,p))
    nmers_scoresT.sort(lambda x,y: cmp(x[1],y[1]))
    last = min(20,len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = MotifTools.Motif('',Q)
        m.compute_from_text(seq,0.1)
        models.append(m)
    for tup in nmers_scoresT[0:40]:
        print tup
    return(models)
Exemple #9
0
def main(fastafile, outDirectory):  # !! 1/2/09 AD added 'fastafile' var and changed 'if __name__' as way to call this from script.
    """Write pseudocounted n-mer frequencies (widths 1-6) to a .freq file.

    fastafile    -- path of the FASTA file to analyse
    outDirectory -- directory that receives '<fastafile basename>.freq'

    Every n-mer from permute(w) starts with a pseudocount of 1; observed
    counts are added to both the n-mer and its reverse complement.
    """
    seqsD = Fasta.load(fastafile)
    seqs  = seqsD.values()
    baseName = fastafile.split('/')[-1]
    
    output = []
    for w in range(1,7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w,seqs,'with counts','purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1 #Pseudo count
            total = total + 1
        for nmer,count in nmersT[:]:
            try: 
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc]   = nmersD[rc]   + count
                total = total + 2*count
            except KeyError:
                # n-mers not generated by permute() are ignored
                pass
        # each line carries its own '\n' so the file can be written verbatim
        output.append("# freq in %s (total %d with pseudocounts)\n"%(baseName,total))
        for nmer in sorted(nmersD.keys()):
            output.append( "%-7s %20.17f\n"%(nmer,float(nmersD[nmer]) / total))
        
    # Write the results once, after all widths are done.  The original
    # reopened and rewrote the file on every pass of the loop and never
    # closed it; the final file contents are the same.
    outFile = open('%s/%s.freq' % (outDirectory, baseName), 'w')
    try:
        outFile.writelines(output)
    finally:
        outFile.close()
Exemple #10
0
def main():
    seqsD = Fasta.load(sys.argv[1])
    seqs = seqsD.values()
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  #Pseudo count
            total = total + 1
        for nmer, count in nmersT[:]:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc] = nmersD[rc] + count
                total = total + 2 * count
            except KeyError:
                pass
        _t = nmersD.keys()
        _t.sort()
        print "# freq in %s (total %d with pseudocounts)" % (sys.argv[1],
                                                             total)
        for nmer in _t:
            print "%-7s %20.17f" % (nmer, float(nmersD[nmer]) / total)
        sys.stdout.flush()
Exemple #11
0
def calcStats(fastaPath):
    """Return base-composition statistics for all sequences in a FASTA file.

    The sequences are concatenated and upper-cased, then A/C/G/T/N are
    counted.  The result is a dict with the number of sequences
    ('seqLen'), total length, per-base counts, the count of non-N bases,
    the N fraction relative to the total and to non-N bases, and the GC
    fraction of non-N bases.
    """
    records = Fasta.load(fastaPath)
    allBases = ''.join([records[name] for name in records]).upper()

    counts = {}
    for base in 'ACGTN':
        counts[base] = allBases.count(base)

    totNucs = len(allBases)
    nonNs   = counts['A'] + counts['C'] + counts['G'] + counts['T']

    stats = {}
    stats['seqLen']    = len(records)
    stats['totNucs']   = totNucs
    stats['aCnt']      = counts['A']
    stats['cCnt']      = counts['C']
    stats['gCnt']      = counts['G']
    stats['tCnt']      = counts['T']
    stats['nCnt']      = counts['N']
    stats['nonNs']     = nonNs
    stats['n2tot']     = float(counts['N'])/totNucs
    stats['n2nonN']    = float(counts['N'])/nonNs
    stats['percentGC'] = (float(counts['G'])+counts['C'])/nonNs
    return stats
Exemple #12
0
def orf2pseq(orf):
    """Return the protein sequence for an ORF name, or '' if unknown.

    The sequences are loaded from _ORFPSEQS into the module-level cache
    _orfpseqs on first use; a trailing '*' is stripped from each sequence
    at load time.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf, pseq in list(_orfpseqs.items()):
            # endswith is safe for empty sequences (pseq[-1] raised IndexError)
            if pseq.endswith('*'): _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
Exemple #13
0
def orf2pseq(orf):
    """Look up the protein sequence of an ORF; return '' when not found.

    On the first call the FASTA file _ORFPSEQS is loaded into the global
    cache _orfpseqs, with any trailing '*' removed from each sequence.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf,pseq in list(_orfpseqs.items()):
            # Slicing instead of pseq[-1] so an empty sequence cannot
            # raise IndexError.
            if pseq[-1:] == '*': _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
Exemple #14
0
    def train_final(self, model, fg, bg, N, beta):
        """Pick the final motif and the SVM with the lowest training error.

        model -- key into self.models (or seed for self.train_model)
        fg    -- names of foreground probes (keys of self.all_probes)
        bg    -- names of background probes
        N     -- passed twice to self.SMOTE for over-sampling
        beta  -- passed to self.train_model when self.refine is set

        Returns (final_motif, best_classifier, fn), where fn comes from
        the final self.SVM_test call on the un-oversampled positives.
        When self.dump is set, the foreground probes are also written to
        '<motif_file stem>.pos.fsa' / '.neg.fsa' according to the
        classifier's verdict.
        """
        # Upper-case the foreground sequences and strip ';' and 'N'.
        cleaned = []
        for name in fg:
            probe = self.all_probes[name].upper()
            probe = probe.replace(";", "").replace("N", "")
            if probe:
                cleaned.append(probe)

        if self.refine:
            final_motif = self.train_model(model, cleaned, beta)
        else:
            final_motif = self.models[model]

        train_pos = self.get_LLRs(final_motif, fg)
        train_neg = self.get_LLRs(final_motif, bg)
        over_sampled_positive = self.SMOTE([train_pos], N, N)[0]

        # Try each cost value and keep the classifier with the lowest
        # training error (strictly below the starting value of 1.0).
        best_classifier = None
        best_err = 1.0
        for cost in [1.0e-10, 1.0e-4, 1.0e-3, 1.0e-2, 0.05, 0.1, 1.0, 10.0, 100.0]:
            candidate = self.SVM_train(over_sampled_positive, train_neg, cost)
            err = self.SVM_test(candidate, over_sampled_positive, train_neg)
            if err < best_err:
                best_err, best_classifier = err, candidate

        (train_err, fp, fn) = self.SVM_test(best_classifier, train_pos, [], 1)

        if self.dump:
            # Split the foreground probes by whether the classifier
            # reports an error for them.
            with_motif = {}
            without_motif = {}
            for name, val in zip(fg, train_pos):
                if self.SVM_test(best_classifier, [val], []):
                    without_motif[name] = self.all_probes[name]
                else:
                    with_motif[name] = self.all_probes[name]
            stem = self.motif_file.split('.')[0]
            Fasta.write(with_motif, stem + '.pos.fsa')
            Fasta.write(without_motif, stem + '.neg.fsa')

        return (final_motif, best_classifier, fn)
Exemple #15
0
def loadSeqs(fastaPathList):
    """
    Load every fasta file named in fastaPathList into one dict and return it.
    Later files overwrite earlier entries that share a name.
    The combined dict is passed through bioDefs.softMaskDict2HardMask
    (soft masking converted to hard) before being returned.
    """
    combined = {}
    for fastaPath in fastaPathList:
        loaded = Fasta.load(fastaPath)
        combined.update(loaded)

    bioDefs.softMaskDict2HardMask(combined)
    return combined
Exemple #16
0
def get_seq(chr,start=None,stop=None):
    """Return a stretch of chromosome sequence from the cached yeast genome.

    chr   -- chromosome as 'chr4', a bare number/name (4, 'X'), or a
             range string such as 'chr4:454-465'
    start -- 1-based first position; taken from the range string when
             omitted, otherwise defaults to the chromosome start
    stop  -- last position (inclusive); defaults to the chromosome end

    The genome is loaded from SGDdir + 'NCBI_yeast_genome.fsa' into the
    global ChrD on first use.  Raises KeyError for an unknown chromosome.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    if (type(chr) != type('')) or (chr.find('chr') != 0):  # 1 -> chr1, 'X' -> chrX
        chr = 'chr%s'%chr
    if (start is None) and chr.find(':') > 0:                  # chr4:454-465 -> chr4, 454, 465
        chr,_range = chr.split(':')
        start, stop = _range.split('-')
        start, stop = int(start), int(stop)
    # Fixed: the original sliced with an undefined 'end' whenever the
    # coordinates were passed as arguments, and ignored 'stop' entirely.
    if start is None:
        start = 1
    return ChrD[chr][start-1:stop]
Exemple #17
0
def get_seq(chr, start=None, stop=None):
    """Fetch a sub-sequence of a yeast chromosome.

    chr   -- 'chr4', a bare identifier (4, 'X'), or 'chr4:454-465'
    start -- 1-based first position; parsed from the range form of chr
             when omitted, else the beginning of the chromosome
    stop  -- inclusive last position; defaults to the chromosome end

    The genome FASTA (SGDdir + 'NCBI_yeast_genome.fsa') is cached in the
    global ChrD after the first call.  Raises KeyError if the chromosome
    is not present.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    # 1 -> chr1, 'X' -> chrX
    if (type(chr) != type('')) or (chr.find('chr') != 0):
        chr = 'chr%s' % chr
    # chr4:454-465 -> chr4, 454, 465
    if (start is None) and chr.find(':') > 0:
        chr, _range = chr.split(':')
        first, last = _range.split('-')
        start, stop = int(first), int(last)
    # Fixed: 'stop' used to be ignored and the slice used 'end', which was
    # only bound when a range string had been parsed (NameError otherwise).
    if start is None:
        start = 1
    return ChrD[chr][start - 1:stop]
Exemple #18
0
def swp2swp(swp):
    """Converts, when possible, from P014543 to ADR1_YEAST (and back).

    Only works for yeast right now.  The two-way table is built from the
    header lines of _SWPFASTA on first use and kept in the global
    _swp2swp.  Returns None when swp is not in the table.
    """
    global _swp2swp
    if not _swp2swp:
        for header in Fasta.keys(_SWPFASTA, key_func=lambda x: x):
            fields = header.split()
            text_name, numeric_name = fields[1], fields[2]
            # keep only 'SW...' text names paired with 'P...' accessions
            if text_name[0:2] != 'SW' or numeric_name[0] != 'P':
                continue
            short_name = text_name[3:]
            _swp2swp[short_name] = numeric_name
            _swp2swp[numeric_name] = short_name
    return _swp2swp.get(swp)
Exemple #19
0
def swp2swp(swp):
    """Translate between the two SwissProt name forms (P014543 <-> ADR1_YEAST).

    Only works for yeast right now.  The lookup table lives in the global
    _swp2swp and is filled from the _SWPFASTA header lines the first time
    the function runs.  Unknown names give None.
    """
    global _swp2swp
    if not _swp2swp:
        lines = Fasta.keys(_SWPFASTA,key_func=lambda x:x)
        for line in lines:
            toks = line.split()
            text_name    = toks[1]
            numeric_name = toks[2]
            is_pair = (text_name[0:2] == 'SW') and (numeric_name[0] == 'P')
            if is_pair:
                # store both directions; text_name[3:] drops the first three characters
                _swp2swp[text_name[3:]]  = numeric_name
                _swp2swp[numeric_name]   = text_name[3:]
    if swp in _swp2swp:
        return _swp2swp[swp]
    return None
Exemple #20
0
 def go(self):
     """Execution function: runs TAMO.MD.Meme.Meme and catches the output in self.output for access from MDAP.

     The co-regulated sequences (self.coRegSeqs[0]) are written to a
     temporary FASTA file in the current directory, Meme is run on it, and
     the file is removed afterwards.
     """
     import time
     
     # write a temp fasta file of coregulated seqs to use as input to Meme(file=TempFasta)
     ctimeStr  = time.ctime().replace(' ','_')
     fileName  = 'tempFastaOfCoRegSeqs.MDAP.%s.fas' %(ctimeStr)
     tFastaTxt = Fasta.text(self.coRegSeqs[0])
     try:
         tFasta = open(fileName, 'w')
         try:
             tFasta.write(tFastaTxt)
         finally:
             # Fixed: the file must be closed (flushed) before Meme reads
             # it; the original left it open with data possibly buffered.
             tFasta.close()
         
         # Call TAMO to do its thing:
         self.output = Meme(file=fileName, width='', extra_args=self.extra_args, bfile=self.bfile)
     finally:
         # delete temp file, even when writing or Meme failed
         if os.path.exists(fileName):
             os.remove(fileName)
def LoadDNA(verbose=False):
	"""Load the DNA sub-sequence described by the DNA section of params.

	Reads FILE, CHR, START and END (or LENGTH when END is unset) from
	params, loads the FASTA file and returns seq[START:END] of the named
	chromosome.  Returns "" when no FILE is configured or the chromosome
	is not in the file.

	verbose -- when true, progress and the extracted DNA are printed
	"""
	###############################################################################
	#
	#	Read DNA sequence
	#	Extract sub-sequence to model
	#	Define rules for DNA
	#
	###############################################################################
	dna = ""
	fastafile = params.GetString(DNA_section,"FILE")
	if (fastafile):
		chromo = params.GetString(DNA_section,"CHR")
		chr_start  = params.GetInt(DNA_section,"START")
		chr_end    = params.GetInt(DNA_section,"END")
		if (not chr_end):
			# no explicit END: derive it from START + LENGTH
			chr_end = params.GetInt(DNA_section,"LENGTH")
			chr_end += chr_start
		if verbose:
			print ("Loading fasta: [%s]\n"%fastafile)

		seqs = Fasta.load(fastafile)

		seqkeys = sorted(seqs.keys())

		n = 0
		for name in seqkeys:
			n += len(seqs[name])
		if verbose:
			print("Genome length = %d, # chromosomes = %d\n"%(n, len(seqkeys)))

		if (chromo in seqs):
			seq = seqs[chromo]
			if verbose:
				print("Chr[%s] = %d nt\n"%(chromo,len(seq)))
			dna = seq[chr_start:chr_end]
			if verbose:
				print("DNA[%d:%d] = %d nt\n"%(chr_start,chr_end,len(dna)))
		else:
			if verbose:
				# Fixed: the original referenced an undefined name
				# 'filename' here, raising NameError instead of reporting.
				print("Cannot find [%s] chromosome in %s\n"%(chromo, fastafile))
		if (verbose):
			print("DNA:[%s]\n"%dna)

	return dna
Exemple #22
0
def spawnOrthoGroups(promoterFileList,nWayOrthoList):
    """Takes promoterFileList<listOfPaths> and nWayOrthoList<listOfLists> and spawns the orthoGroup
    objects in a dictionary with keys = 'geneName1:geneName2:etc' that will be used to run the combined
    hypergeometric analysis.

    Groups with any member that has no (or an empty) promoter sequence are
    left out of the result.  Each inner list of nWayOrthoList that yields a
    group is sorted in place.  Raises AssertionError on bad argument types
    or when a gene name occurs in more than one promoter file.
    """
    
    # validation
    assert type(promoterFileList) == type([]), \
           '''promoterFileList must be a list of file paths.
           You provided type: "%s"'''\
           % (type(promoterFileList))
    assert type(promoterFileList[0]) == type(''), \
           '''promoterFileList must be a list of file paths.
           promoterFileList[0] != type(''): "%s"'''\
           % (type(promoterFileList[0]))
    
    # load promoters
    allPromoters = {}
    for promoterFile in promoterFileList:
        oneGenome = Fasta.file2dict(promoterFile)
        for geneName in oneGenome:
            # dict membership test; the original rebuilt the key list for
            # every gene (quadratic)
            assert geneName not in allPromoters, \
                   '''Detected duplicate gene name in promoterFileList! "%s"'''\
                   % (geneName)
            allPromoters[geneName] = oneGenome[geneName]
    
    # Build Groups
    orthoGroups = {}
    for group in nWayOrthoList:
        groupDict = {}
        for geneName in group:
            # .get: a gene absent from every promoter file counts as a
            # missing member instead of raising KeyError
            promoter = allPromoters.get(geneName)
            if not promoter:
                break # we do not want orthoGroups that are missing members
            groupDict[geneName] = promoter
        
        if len(groupDict) != len(group):
            # Fixed: skip only this incomplete group.  The original used
            # 'break', which silently dropped every remaining group too.
            continue
        group.sort()
        orthoGroups[':'.join(group)] = OrthoGroup(groupDict)
            
    return orthoGroups
Exemple #23
0
def main():
    short_opts = 'f:'
    long_opts = ['genome=', 'range=', 'top=', 'pcnt=', 'bgfile=']
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        print getopt.GetoptError.__dict__
        usage()
    if not opts: usage()

    fastafile = ''
    top_count = 10
    top_pcnt = None
    genome = 'YEAST'
    w_start = 8
    w_stop = 15
    bgfile = MDSCAN_DIR + 'yeast_int.bg'
    for opt, value in opts:
        if opt == '-f': fastafile = value
        if opt == '--genome': genome = value
        if opt == '--top': top_count = int(value)
        if opt == '--pcnt': top_pcnt = float(value)
        if opt == '--range':
            w_start, w_stop = [int(x) for x in value.split(',')]

    print "#" + ' '.join(sys.argv)
    probeids = Fasta.keys(fastafile)
    Genome = MotifMetrics.ProbeSet(genome)

    probeids = Genome.filter(probeids)

    if top_pcnt:
        top_count = max(top_count, int(top_pcnt / 100.0 * len(probeids)))

    theMeta = metaMDscan(fastafile, w_start, w_stop, top_count)

    for m in theMeta.motifs:
        m.pvalue = Genome.p_value(m, probeids, 'v')
        m.church = Genome.church(m, probeids, 'v')
        sys.stdout.flush()

    theMeta.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    print_motifs(theMeta.motifs)
Exemple #24
0
def LoadDNA():
    """Load the DNA sub-sequence described by the DNA section of params.

    Reads FILE, CHR, START and END (or LENGTH when END is unset) from
    params, loads the FASTA file, prints a short report and returns
    seq[START:END] of the named chromosome.  Returns "" when no FILE is
    configured or the chromosome is not in the file.
    """
    ###############################################################################
    #
    #	Read DNA sequence
    #	Extract sub-sequence to model
    #	Define rules for DNA
    #
    ###############################################################################
    dna = ""
    fastafile = params.GetString(DNA_section, "FILE")
    if (fastafile):
        chromo = params.GetString(DNA_section, "CHR")
        chr_start = params.GetInt(DNA_section, "START")
        chr_end = params.GetInt(DNA_section, "END")
        if (not chr_end):
            # no explicit END: derive it from START + LENGTH
            chr_end = params.GetInt(DNA_section, "LENGTH")
            chr_end += chr_start

        print("Loading fasta: [%s]\n" % fastafile)

        seqs = Fasta.load(fastafile)

        seqkeys = sorted(seqs.keys())

        n = 0
        for name in seqkeys:
            n += len(seqs[name])

        print("Genome length = %d, # chromosomes = %d\n" % (n, len(seqkeys)))

        if (chromo in seqs):
            seq = seqs[chromo]
            print("Chr[%s] = %d nt\n" % (chromo, len(seq)))
            dna = seq[chr_start:chr_end]
            print("DNA[%d:%d] = %d nt\n" % (chr_start, chr_end, len(dna)))
        else:
            # Fixed: the original referenced an undefined name 'filename'
            # here, raising NameError instead of reporting the problem.
            print("Cannot find [%s] chromosome in %s\n" % (chromo, fastafile))
        print("DNA:[%s]\n" % dna)

    return dna
Exemple #25
0
def swp_find_and_format(swp):
    global _swp_seqs
    if not _swp_seqs:
        _swp_seqs = Fasta.load(_SWPFASTA,key_func=lambda x:x)
    hits = []
    for key in _swp_seqs.keys():
        if key[0:60].find(swp) >= 0:
            hits.append(key)
    if not hits:
        return None
    if len(hits) > 1:
        print "# Multiple matches found for %s:"%swp
        for hit in hits: print '#',hit
        return None
    hit = hits[0]
    seq = _swp_seqs[hit]
    txt = ''
    for i in range(0,len(seq),70):
        txt = txt + seq[i:i+70] + '\n'
    return txt
Exemple #26
0
 def go(self):
     """Execution function: coordinates options used and background GC calculation, then runs
     TAMO.MD.AlignAce.MetaAce and catches the output in self.output for access from MDAP.
     Output is TAMO.AligAce result object.

     The co-regulated sequences (self.coRegSeqs[0]) are written to a
     temporary FASTA file in the current directory, which is left in place
     after the run.
     """
     import time
     # Calc GC background of genomic sequences representing the
     # entire data set if requested.
     if self.mdapOptions['background'] == 1:
         self.dataStats = seqStats.calcStats(self.mdapArgs[0])
         self.gcback = self.dataStats['percentGC']
         
     # write a temp fasta file of coregulated seqs to use as input to MetaAce
     ctimeStr  = time.ctime().replace(' ','_')
     fileName  = 'tempFastaOfCoRegSeqs.MDAP.%s.fas' %(ctimeStr)
     tFastaTxt = Fasta.text(self.coRegSeqs[0])
     tFasta    = open(fileName, 'w')
     try:
         tFasta.write(tFastaTxt)
     finally:
         # Fixed: close (and thereby flush) the file before MetaAce reads
         # it; the original left it open with data possibly still buffered.
         tFasta.close()
     
     # call TAMO to do its thing
     # NOTE(review): self.gcback is only assigned above when
     # mdapOptions['background'] == 1; presumably it has a default set
     # elsewhere -- confirm.
     self.output = MetaAce(fileName, self.width, self.iterations, self.gcback)
Exemple #27
0
def swp_find_and_format(swp):
    global _swp_seqs
    if not _swp_seqs:
        _swp_seqs = Fasta.load(_SWPFASTA, key_func=lambda x: x)
    hits = []
    for key in _swp_seqs.keys():
        if key[0:60].find(swp) >= 0:
            hits.append(key)
    if not hits:
        return None
    if len(hits) > 1:
        print "# Multiple matches found for %s:" % swp
        for hit in hits:
            print '#', hit
        return None
    hit = hits[0]
    seq = _swp_seqs[hit]
    txt = ''
    for i in range(0, len(seq), 70):
        txt = txt + seq[i:i + 70] + '\n'
    return txt
Exemple #28
0
 def __init__(self,fastaSeqs, motifDict, thresh=0.5,window=200):
     
     self.seqMaps = {}
     
     # Get seqs from fasta
     assert type(fastaSeqs) == type('string') or type(fastaSeqs) == type({}),\
            'MapLib arg(fastaSeqs) must be string pointing to file or a seqDict.'
     if type(fastaSeqs) == type('string'):
         seqs = Fasta.load(fastaSeqs)
     elif type(fastaSeqs) == type({}):
         seqs = fastaSeqs
     
     # Instantiate a SeqMap obj for each seq in seqs
     c = 0
     for k in seqs:
         c += 1
         assert c <= 250
         realT1 = time()
         self.seqMaps[k] = SeqMap(k, seqs[k], motifDict, thresh=thresh, window=window)
         realT2 = time()
         print '%.4f\t%s' % (realT2-realT1,c)
Exemple #29
0
def main():
    short_opts = 'f:'
    long_opts  = ['genome=', 'range=', 'top=', 'pcnt=', 'bgfile=']
    try:   opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError:
        print getopt.GetoptError.__dict__
        usage()
    if not opts: usage()

    fastafile = ''
    top_count = 10
    top_pcnt  = None
    genome    = 'YEAST'
    w_start   = 8
    w_stop    = 15
    bgfile    = MDSCAN_DIR + 'yeast_int.bg'
    for opt,value in opts:
        if opt == '-f':         fastafile = value
        if opt == '--genome':   genome    = value
        if opt == '--top':      top_count = int(value)
        if opt == '--pcnt':     top_pcnt  = float(value)
        if opt == '--range':    w_start,w_stop= [int(x) for x in value.split(',')]

    print "#" + ' '.join(sys.argv)
    probeids = Fasta.keys(fastafile)
    Genome = MotifMetrics.ProbeSet(genome)

    probeids = Genome.filter(probeids)

    if top_pcnt: top_count = max(top_count,int(top_pcnt/100.0 * len(probeids)))

    theMeta = metaMDscan(fastafile,w_start,w_stop,top_count)

    for m in theMeta.motifs:
        m.pvalue = Genome.p_value(m,probeids,'v')
        m.church = Genome.church(m,probeids,'v')
        sys.stdout.flush()

    theMeta.motifs.sort(lambda x,y: cmp(x.pvalue,y.pvalue))
    print_motifs(theMeta.motifs)
Exemple #30
0
def geneList2FastaDict(geneList, sourceFastaPath, hardMasked=True):
    """
    Returns a Dict of requested fasta recs in form SeqName:Sequence.
    Defaults to HardMasked return seqeunces.
    """
    
    sourceDict = Fasta.load(sourceFastaPath)
    
    # make new dict of all genes both in geneList AND sourceDict
    # new dict may be shorter than geneList!!!!!!
    
    newDict = {}
    for i in geneList:
        if sourceDict[i]:
            newDict[i] = sourceDict[i]
            
    print "%s genes names given, %s found." % (len(geneList), len(newDict))
    
    if hardMasked:
        softMaskDict2HardMask(newDict)
    
    return newDict
Exemple #31
0
 def __init__(self,species='YEAST',seqs=''):
     if  seqs:
         self.sourcefile = 'Runtime (%d sequences)'%len(seqs)
     elif  species.find('.6MBG') >= 0:
         self.sourcefile = species
     elif species[0:5] == 'YEAST':
         self.sourcefile = TAMO.paths.Whiteheaddir+'Yeast6kArray/yeast.intergenic.6.freq'
         TAMO.paths.CHECK(self.sourcefile,'Whitehead')
     elif species[0:5] == 'HUMAN':
         self.sourcefile = TAMO.paths.Whiteheaddir+'Human13kArray/human_elongated_probbesQC250.6MBG'
         TAMO.paths.CHECK(self.sourcefile,'Whitehead')
     elif os.path.exists(re.sub('.fsa|.fasta','.6MBG',species)):
         self.sourcefile = re.sub('.fsa|.fasta','.6MBG',species)
     elif os.path.exists(species) and (species.find('.fsa') >=0):
         self.sourcefile = species
         print "EM.MarkovBackground: Computing background from %s"%species
         sys.stdout.flush()
         self.freqs_from_seqs(Fasta.seqs(species))
     #elif os.path.exists(species):
     #    self.sourcefile = species
     else:
         print 'EM.MarkovBackground: Unknown species %s, using Yeast'
         self.sourcefile = TAMO.paths.Whiteheaddir+'Yeast6kArray/yeast.intergenic.6.freq'
         TAMO.paths.CHECK(self.sourcefile,'Whitehead')
     self.species = species
     self.D  = {}
     self.F  = {}         #Frequencies
     self.CP = {}         #log2(Conditional Probabilities)  CP['ACTG'] = p( G | ACT ) 
     self.nmers_by_size = map(lambda x:[],range(0,10))
     self.highestorder = 0
     if seqs:
         print "EM.MarkovBackground: Computing background from %d sequences"%len(seqs)
         self.freq_from_seqs(seqs)
     else:
         self.freq_from_file()
     self.compute_conditional()
     self.totD = {}
Exemple #32
0
    def __init__(self, fg_file, bg_file, cv_level, markov_file):
        self.cv_level = cv_level
        self.randomize = 0
        self.beta = 0.0
        self.delta = 0.001
        self.refine = 1
        self.motif_file = 'dummy.out'
        self.dump = 0
        self.family = ''
        self.datafiles = (fg_file,bg_file)
        
        MAX_FG = 2000
        
        #LOAD MARKOV BACKGROUND#
        print "Loading Markov background file from %s"%markov_file
        EM.loadMarkovBackground(markov_file)    

        ##################################################################################
        #divide input sequences into groups according to the desired cross-validation level
        ###################################################################################
        print "Processing input sequences...."
        self.fg_seqs = Fasta.load(fg_file)   #load foreground sequences
        for key in self.fg_seqs.keys():
            fseq = self.fg_seqs[key]
            self.fg_seqs[key] = fseq.split()[0]
        self.all_probes = Fasta.load(bg_file)   #load background sequences
        Fasta.delN(self.fg_seqs)
        Fasta.delN(self.all_probes)

        #first delete any sequences from background that are present in foreground
        for key in self.fg_seqs.keys():
            if (self.all_probes.has_key(key)):
                del self.all_probes[key]

        for key in self.all_probes.keys():
            if ((len(self.all_probes[key])==0) or (re.search('[SWMKRY]', self.all_probes[key]))):
                del self.all_probes[key]
                print "deleting %s"%key
                
        while (len(self.fg_seqs.keys())>MAX_FG):
            del self.fg_seqs[self.fg_seqs.keys()[random.randint(0,(len(self.fg_seqs.keys())-1))]]
Exemple #33
0
from TAMO.seq import Fasta
from gusPyCode.defs import bioDefs

# Input: mature miRNA fasta.  seedFile names the intended seed output.
miRNAFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.fa'
seedFile  = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.seeds.ctrl.fa'

oligoType = 'control' # 'match' or 'control'
# Fixed: "oligoType == 'match' or 'control'" was always true because the
# bare string 'control' is truthy; test membership instead.
assert oligoType in ('match', 'control'), 'oligoType MUST be only "match" or "control".'

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through miRNA dict taking 7mers starting at pos 1 
#    and then pos2. Adapt key to reflect which. 
# 2) Convert to all uppers and convert U's to T's
# 3) If oligoType == 'match', rvcmp each 7mer and adapt key
#    to reflect which.
for miRNA in miRNAs:
    pos1_seed = miRNAs[miRNA][:7].upper().replace('U','T')
    pos2_seed = miRNAs[miRNA][1:8].upper().replace('U','T')


    if oligoType == 'match':
        seeds[miRNA+'_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA+'_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        seeds[miRNA+'_ctrl_pos1'] = pos1_seed
        seeds[miRNA+'_ctrl_pos2'] = pos2_seed
Exemple #34
0
 def freq_from_fasta(self,fastafile):
    """Compute frequencies from the sequences stored in fastafile.

    Loads the FASTA file and hands its sequences to self.freq_from_seqs.
    """
    # Fixed: the original loaded sys.argv[1] and ignored its argument.
    seqsD = Fasta.load(fastafile)
    seqs  = seqsD.values()
    self.freq_from_seqs(seqs)
Exemple #35
0
from TAMO.MotifTools import top_nmers,Motif
from TAMO import MotifTools
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import ifKmerInAll

seqFile     = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.fas'
outFile     = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.8mersInAll.txt'
kmerSize    = 8
scoreThresh = 0.999999

seqs = Fasta.file2dict(seqFile)



# create new dict to store the seqs' kmers
seqsKmers = {}
for i in seqs:
    seqsKmers[i] = top_nmers(kmerSize,[seqs[i]], purge_Ns = 1)   # for some reason top_nmers fails silently if given str instead of list

inAllSeqs = []
count = 0
for seq in seqsKmers:
    for kmer in seqsKmers[seq]:
        if ifKmerInAll(kmer,seqs,scoreThresh):
            if kmer not in inAllSeqs:
                inAllSeqs.append(kmer)
                count+=1
                print count


outFile = open(outFile, 'w')
Exemple #36
0
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)
        

    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.7)
    ambigs = []

    scale   = 50.0 / 1000.0
    
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt == '-f':  fastafile = value
        elif opt == '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':  labels    = list(value)
        elif opt == '-t':  thresh    = float(value)
        elif opt == '-a':  ambigs.extend(value.split(','))
        elif opt == '-S':  scale     = float(value)
        
    probes = Fasta.load(fastafile)
    
    if motiffile:
        motifs.extend(TAMO.tamofile2motifs(motiffile))
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            result = motifs[num].scan(probes[key],thresh*motif.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                            else:
                                a[start+offset] = '%d'%i
                            break
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
        
    print
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
Exemple #37
0
def main():
    if len(sys.argv) < 3:
        print "Usage: %s <fasta_file> [width = None ] [options]" % (re.sub(
            '^.*/', '', sys.argv[0]))
        print "Options include:"
        print "                  -valid  <tf_name> Check answers against Transfac"
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
        print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -info <file.info> for structural priors"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
        print "                  -B                Use Bacterial Orfs"
        print " "
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC " % (
            sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 " % (
            sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print
        print " %s t.fsa -prior MCM1_5.tamo:0 " % (sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
        print
        sys.exit(1)
    fastafile = sys.argv[1]

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ', '\ ', x), sys.argv))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else:
        width = None

    algorithm = ''
    beta = ''
    seedbeta = ''
    cbeta = ''
    deltamin = ''
    gamma = 0.2
    infofile = ''
    seedmodels = []
    species = 'YEAST'
    valid_tfs = []
    gapped_syl = None
    gapflank = 0
    gapweight = 0.2
    enrichfact = 0.7
    pmax = 0  #False
    TFSEEDS = 0
    TFMids = []
    pad = None
    padlen = 0
    thetas = []
    seed_count = 0  #Default: Take the top 0
    seed_s = []  #Initialize seq array
    sp_seed = 0
    '''Parse command-line arguments'''
    for tok, i in zip(sys.argv, xrange(len(sys.argv))):
        if tok == '-top': seed_count = int(sys.argv[i + 1])
        elif tok == '-greedy': algorithm = "GREEDY"
        elif tok == '-prior': seed_s.append(sys.argv[i + 1])
        elif tok == 'sp': sp_seed = 1
        elif tok == '-beta': beta = float(sys.argv[i + 1])
        elif tok == '-beta': seedbeta = float(sys.argv[i + 1])
        elif tok == '-cbeta': cbeta = float(sys.argv[i + 1])
        elif tok == '-thetas':
            for j in range(int(sys.argv[i + 1])):
                thetas.append(float(sys.argv[i + j + 2]))
        elif tok == '-gamma':
            gamma = float(sys.argv[i + 1])
        elif tok == '-delta':
            deltamin = float(sys.argv[i + 1])
        elif tok == '-info':
            infofile = sys.argv[i + 1]
        elif tok == '-valid':
            valid_tfs.append(sys.argv[i + 1])
        elif tok == '-w':
            width = sys.argv[i + 1]
        elif tok == '-width':
            width = sys.argv[i + 1]
        elif tok == '-gap':
            gapped_syl = sys.argv[i + 1]
        elif tok == '-gapflank':
            gapflank = int(sys.argv[i + 1])
        elif tok == '-gapweight':
            gapweight = float(sys.argv[i + 1])
        elif tok == '-enrichfact':
            enrichfact = float(sys.argv[i + 1])
        elif tok == '-pmax':
            pmax = 1
        elif tok == '-Y2K':
            species = "YEAST_2000_UP"
        elif tok == '-Y5C':
            species = "YEAST_500_UP"
        elif tok == '-B':
            species = "BAC_ORF"
        elif tok == '-Ch22':
            species = "Ch22"
        elif tok == '-genome':
            species = sys.argv[i + 1]
        elif tok == '-pad':
            pad = sys.argv[i + 1]
            padlen = sys.argv[i + 2]
        elif tok == '-TF':
            TFSEEDS = 1
            for j in range(i + 1, len(sys.argv)):
                if re.match('M0', sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human':
            _s = ''
            if sys.argv[i + 1].isdigit(): _s = '_' + sys.argv[i + 1]
            else: _s = ''
            species = 'HUMAN' + _s

    seqs = []
    fsaD = Fasta.load(fastafile)
    probes = fsaD.keys()
    '''
    for probeid in fsaD.keys():
        seqs.append  (fsaD    [probeid])
    '''
    numprobes = len(probes)
    #print "numprobes: %i"%numprobes
    if not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(fastafile, numprobes, species)

    #seqs     = EM.fasta2seqs(fastafile)
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count, len(seqs))])
    #not necessary --- seed_c.extend(c_seqs[0:min(seed_count,len(seqs))])

    if infofile and width == 'info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width, infofile, fastafile, species))
        else:
            print 'Error: need to specify motif width w/ .info file'

    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS:
        tf = []
        D = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*', '', key.split()[0]))
            m.seedtxt = '%-24s %s' % (m, key)
            tf.append(m)
        tf.sort(lambda x, y: cmp(x.seednum, y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x: 'N' + x + 'N', gapped_priors)
        seed_s.extend(gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        for m in seedmodels:
            newmodels.append(m[-2, m.width + 2])
        seedmodels = newmodels
    '''
    Set everything up and GO!!
    '''
    global theEM
    theEM = EM.EM(seed_s, [], [], width, "VERBOSE")
    if beta: theEM.beta = beta
    if cbeta: theEM.cbeta = cbeta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    if thetas: theEM.thetas = thetas
    theEM.param['gamma'] = gamma
    theEM.probeids.extend(probes)
    theEM.seqs.extend(all_seqs)
    #theEM.cons_seqs.extend(c_seqs)
    theEM.models = seedmodels
    theEM.gapflank = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()  #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))

    #sys.exit(0)
    '''
    Compute some metrics
    '''
    print "#Loading Genome %s" % species
    sys.stdout.flush()
    if species == 'human':
        Genome = ProbeSet('HUMAN', enrichfact)
    else:
        Genome = ProbeSet(species, enrichfact)
    ids = Genome.ids_from_file(fastafile)

    #fsaDict = Fasta.load(fastafile)
    #probes = fsaDict.keys()
    #cons_pickle = fastafile.split('.')[0] + '.cpickle'
    for C in theEM.candidates:
        #p_cons = conservation_pvalue(C.pssm,probes,fsaDict,ConsDict,4)
        #print p_cons
        if not pmax:
            w_dict = Genome.w_dict
            for key, i in zip(w_dict.keys(), range(len(C.pssm.thetas))):
                w_dict[key] = C.pssm.thetas[i]
            Genome.w_dict = w_dict
            C.pssm.pvalue = Genome.p_value(C.pssm, ids, 'verbose')
            #print "P-VAL: %f"%(Genome.p_value(C.pssm,ids,'verbose')*p_cons)
            C.pssm.church = Genome.church(C.pssm, ids)
        else:
            (p, frac) = Genome.best_p_value(C.pssm, ids)
            C.pssm.pvalue = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:", p, frac

    for valid_tf in valid_tfs:
        C.pssm.valid = Validate.validate(C.pssm, valid_tf, 'Verbose',
                                         "Want Tuple")
    '''
    Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    '''

    theEM.candidates.sort(lambda x, y: cmp(x.pssm.pvalue, y.pssm.pvalue))
    for C, i in zip(theEM.candidates, range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3])
        else:
            vstring = ''
        C.pssm._maxscore()  #Recomputed
        print "Log-odds matrix for Motif %3d %s" % (i, C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'):
            m.gamma = None  #Kludge to deal w/ old shelves
        if m.seedtxt: print "Seed: %3d %s" % (i, m.seedtxt)
        if m.source: print "Source: ", m.source
        if m.gamma: print "Gamma: %7.5f" % m.gamma
        if m.threshold: print "Threshold: %5.2f" % m.threshold
        if m.thetas != []:
            tstr = "thetas:"
            for theta in m.thetas:
                tstr = tstr + " " + str(theta)
            print tstr
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None:
            vstring = 'ch: %5.2f  %s' % (math.fabs(
                math.log(C.pssm.church) / math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s" % (
            i, C, -math.log(C.pssm.pvalue) / math.log(10), vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%" % (
                C.pssm.threshold, 100.0 * C.pssm.threshold / C.pssm.maxscore)

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq, i, C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()  #Recomputed
        print '*' * len(seq)
        print "MAP Score: %f" % C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0)  #Avoid ridiculous python cleanup times
Exemple #38
0
def info2seeds(N, infofile, probefile, species='YEAST'):
    if species == 'human':
        species = 'HUMAN'
    G = ProbeSet(species)
    IDs = G.ids_from_file(probefile)
    Q = EM.theMarkovBackground.zeroth()
    seqs = []

    if re.search('.info$', infofile):
        #I    = infoana.Infofile(infofile,'DONT REMOVE QUERY')
        I = infoana.Infofile(infofile)
        print "# Loading infofile: %s" % infofile
        print I
        seqs = map(lambda x: 'NNNN%sNNNN' % x, I.bsites2seqs(50.0))
    elif re.search('.fsa$', infofile):
        fsaDict = Fasta.load(infofile)
        probes = fsaDict.keys()
        #sequence_repository = KenzieSequences()
        cons_pickle = infofile.split('.')[0] + '.cpickle'
        try:
            CFH = open(cons_pickle, 'r')
            ConsDict = pickle.load(CFH)
            CFH.close()
        except:
            ConsDict = {}
            for probe in probes:
                seqs = []
                cons = []
                try:
                    seq_list = G.alignments[probe]
                except:
                    continue
                if (seq_list != []):
                    cer_seq = seq_list[0][1]
                else:
                    cer_seq = ''
                cer_seq = cer_seq.upper()
                numg = len(seq_list) - 1
                for i in range(1, 4):
                    try:
                        seqs.append(seq_list[i][1].upper())
                    except:
                        seqs.append('')
                    cons.append([])
                for position in range(len(cer_seq)):
                    ref = cer_seq[position]
                    for i in range(3):
                        if (seqs[i] == ''): continue
                        if (seqs[i][position] != ref):
                            cons[i].append(1)
                        else:
                            cons[i].append(0)
                ConsDict[probe] = cons
            CFH = open(cons_pickle, 'w')
            pickle.dump(ConsDict, CFH)
            CFH.close()

        for probe in probes:
            superseq = ''
            try:
                seq_list = G.alignments[probe]
            except:
                continue
            for seq in seq_list:
                subseq = seq[1].replace('-', '')
                subseq = subseq.replace('.', '')
                seqs.append(subseq)

    if not N:
        nmers = seqs
    else:
        if (N < 11):
            nmers = ConvergeMotifTools.top_nmers(N, seqs)
        else:
            gaplen = N - 2 * (N / 3)
            gr = ''
            for i in range(gaplen):
                gr = gr + 'N'
            nmers = ConvergeMotifTools.top_nmers(N, seqs, 0, '', 1)
            gnmers = []
            for nmer in nmers:
                gnmers.append(nmer[0:(N / 3)] + gr + nmer[(N / 3):2 * (N / 3)])
            nmers = gnmers
        if len(nmers) > 201: nmers = nmers[0:200]

    print "Scoring enrichment of %d nmers from .info file" % len(nmers)

    nmers_scoresT = []
    for nmer in nmers:
        if nmer[0:(N / 3)].isalpha():
            p = G.p_value(nmer, IDs, 'verbose')
            #if (species=='Ciona'): ng = 2
            #else: ng = 4
            #p_cons = conservation_pvalue(nmer,IDs,fsaDict,ConsDict,ng)
            #if (p_cons<0.1):
            nmers_scoresT.append((nmer, p))
    nmers_scoresT.sort(lambda x, y: cmp(x[1], y[1]))
    #for tup in nmers_scoresT:
    #    print tup
    last = min(20, len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = ConvergeMotifTools.Motif('', Q)
        m.compute_from_text(seq, 0.1)
        models.append(m)
    return (models)
Exemple #39
0
from TAMO.seq import Fasta


fasFile = '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.fas'
oFile1= '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.sample2.fas'
oFile2= '/Users/biggus/Documents/James/Writings_Talks/Grants/09_Feb/PrelimData_Grant_Feb09/Clus2_247genes.test2.fas'

firstDic, secDic = Fasta.random_split(fasFile,0.25)

Fasta.write(firstDic,oFile1)
Fasta.write(secDic,oFile2)

print 'done'
Exemple #40
0
from TAMO import MotifTools 
from TAMO.seq import Fasta 
from TAMO.MotifMetrics import ProbeSet 
from TAMO.MD.AlignAce import AlignAce 
from TAMO.MD.MDscan import MDscan 
from TAMO.MD.Meme import Meme 
#from TAMO.DataSources import GO
from time import time

fastaPath    = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/TC-96.oneLine.fas'
clusterIDS   = Fasta.ids(fastaPath)
totalSeqs    = ProbeSet(fastaPath)  # !! this is wrong should proly be goodAffys

MDbg         = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.MD.bg'

outFile      = '/Users/biggus/Documents/James/Data/ClusterDefs/testTAMOmetrics.txt'

#theAce = AlignAce(fastaPath,width=10)

print 'running MDscan...'
tMD_1 = time()
MDmotifs   = MDscan(fastaPath) #,bgfile=MDbg)
tMD_2 = time()
MD_time = tMD_2-tMD_1
print 'MDscan took %.5f sec == %.3f min.\nMDscan found %s motifs.' % (MD_time,MD_time/60.0, len(MDmotifs.motifs))

print 'running MEME...'
tMeme_1 = time()
memeMotifs = Meme(fastaPath)
tMeme_2 = time()
Meme_time = tMeme_2-tMeme_1
Exemple #41
0
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Write a motifs-by-sequences matrix of normalised best motif scores.

    fsa     -- FASTA file of sequences; loaded with the identity key function.
    motif   -- motif file readable by MotifTools.load (a sequence of motifs).
    outfile -- destination for numpy.savetxt, written with format '%1.3f'.
    genome  -- 'hg18' selects the human promoter Markov file; any other
               value selects the mouse one.

    Each motif's log matrix is adjusted by the zeroth-order background
    (log2 of the letter frequency is subtracted), then cell [i, j] is
    (bestscore - minscore) / (maxscore - minscore) of adjusted motif i on
    upper-cased sequence j.

    NOTE(review): a motif whose adjusted maxscore equals its minscore would
    divide by zero; behaviour for that case is unchanged.
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    #Load motif and background adjust PSSM
    m = MotifTools.load(motif)
    # Loaded once: the same file used to be re-read for every motif inside
    # the loop below although nothing in the loop changes it.
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    F = Fasta.load(fsa, key_func=lambda x: x)
    seqs = F.values()
    n_seqs = len(seqs)
    n_motifs = len(m)
    SCORES = np.zeros((n_motifs, n_seqs), dtype='float')
    log2 = math.log(2.0)

    for i, M in enumerate(m):
        # Build the background-adjusted matrix as a fresh list of dicts so
        # the loaded motif's own logP is no longer modified in place.
        ll = []
        for pos in M.logP:
            adjusted = {}
            for letter in pos.keys():
                adjusted[letter] = pos[letter] - math.log(bg[letter]) / log2
            ll.append(adjusted)
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        #Score every sequence with this motif, rescaled by the motif's
        #own min/max score
        for j, seq in enumerate(seqs):
            seq_fwd = seq.upper()
            max_score = AM.bestscore(seq_fwd)
            mscore = (max_score - mi) / (ma - mi)
            SCORES[i, j] = mscore
    np.savetxt(outfile, SCORES, fmt='%1.3f')
Exemple #42
0
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import geneList2FastaDict
from gusPyCode.defs.mosqData import promoterSeqPaths


geneList = map(lambda l: l.strip(), \
               open('/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.gte2x.genes.txt', 'rU'))

sourceFasta = promoterSeqPaths.Aa_2000bpUp_hardMasked_shuf1

oFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.gte2x.masked.shuffled.1.fas'





newFasta = geneList2FastaDict(geneList, sourceFasta, hardMasked=True)

newFasta = Fasta.text(newFasta)
    
oFile = open(oFile, 'w')
oFile.write(newFasta)

print 'Done'
Exemple #43
0
def memefiles2tamo(files, tamoname):
    global probefile, PROBESET

    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme',
                                            filename).replace('.meme', '.fsa')
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs, tamoname)
Exemple #44
0
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="])  # AD added 'i'
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)
        

    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7
    ambigs = []

    scale   = 50.0 / 1000.0
    
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt ==  '-f':  fastafile = value
        elif opt ==  '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt ==  '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt ==  '-L':  labels    = list(value)
        elif opt ==  '-t':  thresh    = float(value)
        elif opt ==  '-a':  ambigs.extend(value.split(','))
        elif opt ==  '-S':  scale     = float(value)
        elif opt ==  '-i':  motiffile = value  # AD added this option to ACTUALLY supply the tamo motif file at the command-line.  The code to deal with motiffiles already existed. There was just no code for User to supply one.
        
    probes = Fasta.load(fastafile)
    
    if motiffile:
        for f in motiffile.split(','):      # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs
            motifs.extend(MotifTools.load(f))
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            result = motifs[num].scan(probes[key],thresh*motif.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                            else:
                                a[start+offset] = '%d'%i
                            break
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
        
    print
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
Exemple #45
0
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0]))
        print "Options include:"
        print ""
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
#       print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -kmerseeds        Use kmers with best enrichment score as seeds for EM"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -g genome.fsa     Use specicied Fasta file as background (searches first for matching frequency file)"
#       print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
#       print "                  -B                Use Bacterial Orfs"
        print " " 
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print 
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print 
        print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
        print 
        sys.exit(1)
    fastafile = sys.argv[1]

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else: width = None
    
    algorithm = ''
    beta      = ''
    seedbeta  = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = [] #NOT USED
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0  #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    bgfile    = None

    seed_count = 0   #Default: Take the top 0
    seed_s     = []  #Initialize seq array

    '''Parse command-line arguments'''
    for tok,i in zip(sys.argv,xrange(len(sys.argv))):
        if   tok == '-top'   :   seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':   algorithm  = "GREEDY"
        elif tok == '-prior' :   seed_s.append(sys.argv[i+1])
        elif tok == '-beta'  :   beta       = float(sys.argv[i+1])
        elif tok == '-seedbeta': seedbeta   = float(sys.argv[i+1])
        elif tok == '-gamma' :   gamma      = float(sys.argv[i+1])
        elif tok == '-delta' :   deltamin   = float(sys.argv[i+1])
        elif tok == '-kmerseeds' :   infofile   = 1
        elif tok == '-valid' :   valid_tfs.append(sys.argv[i+1]) #NOT USED
        elif tok == '-w'     :   width      = sys.argv[i+1]
        elif tok == '-width' :   width      = sys.argv[i+1]
        elif tok == '-gap'   :   gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax'  :   pmax       = 1
        elif tok == '-Y2K'   :   species    = "YEAST_2000_UP"
        elif tok == '-Y5C'   :   species    = "YEAST_500_UP"
        elif tok == '-B'     :   species    = "BAC_ORF"
        elif tok == '-Ch22'  :   species    = "Ch22"
        elif tok == '-genome':   species    = sys.argv[i+1]
        elif tok == '-pad'   :   pad        = "TRUE"
        elif tok == '-bgfile':   bgfile     = sys.argv[i+1]
        elif tok == '-TF'    :  #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION)
            TFSEEDS = 1
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human' :
            _s = ''
            if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1]
            else:                       _s = ''
            species    = 'HUMAN'+_s

    if infofile: infofile = fastafile

    if bgfile:
        EM.loadMarkovBackground(bgfile)
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(species)
    else:
        EM.theMarkovBackground = EM.Zeroth()

    fsaD     = Fasta.load(fastafile)
    Fasta.delN(fsaD)
    seqs     = fsaD.values()
    probes   = fsaD.keys()
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count,len(seqs))])

    if infofile and width=='info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width,infofile,fastafile,species))
        else:
            print 'Error: need to specify motif width w/ .info file'
    
    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS: #NOT USED
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(lambda x,y: cmp(x.seednum,y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x:'N'+x+'N', gapped_priors)
        seed_s.extend(gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        left  = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        for m in seedmodels:
            newmodels.append(left + m + right)
            print left + m + right
        seedmodels = newmodels

    '''
    Set everything up and GO!!
    '''
    global theEM
    theEM = EM.EM(seed_s,[],width,"VERBOSE")
    if beta:     theEM.beta     = beta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    theEM.param['gamma']        = gamma
    theEM.seqs.extend(all_seqs)
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()    #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))


    '''
    Compute some metrics
    '''
    print "#Loading Genome %s"%species ; sys.stdout.flush()
    Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)
    
    for C in theEM.candidates:
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            C.pssm.church = Genome.church(C.pssm,ids)
            C.pssm.frac   = Genome.frac(C.pssm,probes,None,0.7)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:",p,frac

        matching             = Genome.matching_ids(C.pssm,[],factor=0.7)
        matchbound           = [x for x in matching if x in probes]
        C.pssm.numbound      = len(probes)
        C.pssm.nummotif      = len(matching)
        C.pssm.numboundmotif = len(matchbound)
        sys.stdout.flush()

    
    '''
    Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    '''

    theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue))
    for C,i in zip(theEM.candidates,range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:  #NOT USED
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
        else:
            vstring = ''
        C.pssm._maxscore()     #Recomputed

        MotifTools.print_motif(C.pssm,20,i)
        sys.stdout.flush()
        continue
    
        #Antiquated stuff  -- Remove !!
        print "Log-odds matrix for Motif %3d %s"%(i,C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'):  m.gamma = None #Kludge to deal w/ old shelves
        if m.seedtxt:     print "Seed: %3d %s"%(i,m.seedtxt)
        if m.source:      print "Source: ",m.source
        if m.gamma:       print "Gamma: %7.5f"%m.gamma
        if m.threshold:   print "Threshold: %5.2f"%m.threshold
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None: vstring = 'ch: %5.2f  %s'%(
            math.fabs(math.log(C.pssm.church)/math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%"%(
                C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore)
            

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq,i,C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()      #Recomputed
        print '*'*len(seq)
        print "MAP Score: %f"%C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0) #Avoid ridiculous python cleanup times