Esempio n. 1
0
def SGDData():
    """Download the SGD yeast data files and build one genome FASTA file.

    Fetches the feature tables, protein FASTA files and the per-chromosome
    sequence files listed below into ``TAMO.paths.SGDdir`` through
    ``downloadfiles``, then merges the chromosome files into
    ``NCBI_yeast_genome.fsa`` in that same directory.
    """
    import os.path  # local import: the file-level import block is not visible here

    root    = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/'
    files = ['chromosomal_feature/SGD_features.tab',
             'chromosomal_feature/dbxref.tab',
             'chromosomal_feature/chromosome_length.tab',
             'sequence/GenBank/yeast_nrpep.fasta.gz',
             'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
             # (url, local name) pair: fetched from outside urlroot
             ('http://yeastgfp.ucsf.edu/allOrfData.txt', 'Huh_Nature_2003.tab')
             ]

    chrs = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()

    files.extend(['sequence/NCBI_genome_source/chr%s.fsa' % x for x in chrs])

    downloadfiles(root, urlroot, files)

    from TAMO.seq import Fasta

    print("Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)")
    genome = {}
    for chrom in chrs:
        # os.path.join gives the same path whether or not SGDdir ends in '/'
        chrom_seqs = Fasta.load(os.path.join(root, 'chr%s.fsa' % chrom))
        # Only the first record of each chromosome file is used
        # (presumably each file holds exactly one record).
        seq_id, seq = list(chrom_seqs.items())[0]
        label = chrom
        if label[0] == '0':
            label = label[1]                      # '01' -> '1'
        genome['chr%s  %s' % (label, seq_id)] = seq
    Fasta.write(genome, os.path.join(root, 'NCBI_yeast_genome.fsa'))
Esempio n. 2
0
def main(fastafile, outDirectory):  # !! 1/2/09 AD added 'fastafile' var and changed 'if __name__' as way to call this from script.
    """Write pseudocounted n-mer frequencies (n = 1..6) of a FASTA file.

    For every width, each possible n-mer starts with a pseudocount of 1;
    every n-mer reported by ``MotifTools.top_nmers`` then adds its count to
    itself and to its reverse complement.  The resulting frequency table is
    written to ``<outDirectory>/<basename of fastafile>.freq``.

    The file is written once, after all widths are computed, and is always
    closed.
    """
    seqsD = Fasta.load(fastafile)
    seqs  = seqsD.values()
    fastaname = fastafile.split('/')[-1]

    output = []
    for w in range(1, 7):
        allnmers = permute(w)
        nmersT = MotifTools.top_nmers(w, seqs, 'with counts', 'purge Ns')
        nmersD = {}
        total = 0
        for nmer in allnmers:
            nmersD[nmer] = 1  # Pseudo count
            total = total + 1
        for nmer, count in nmersT[:]:
            try:
                rc = MotifTools.revcomplement(nmer)
                nmersD[nmer] = nmersD[nmer] + count
                nmersD[rc]   = nmersD[rc]   + count
                total = total + 2 * count
            except KeyError:
                # n-mers outside the permuted alphabet are ignored
                pass
        output.append("# freq in %s (total %d with pseudocounts)\n" % (fastaname, total))  # AD 02-27-09 added a '\n' to make file look right
        for nmer in sorted(nmersD.keys()):
            output.append("%-7s %20.17f\n" % (nmer, float(nmersD[nmer]) / total))  # AD 02-27-09 added a '\n' to make file look right

    # open output file and write out results (once, with a guaranteed close)
    outFile = open('%s/%s.freq' % (outDirectory, fastaname), 'w')
    try:
        outFile.writelines(output)
    finally:
        outFile.close()
Esempio n. 3
0
def main():
    """Print pseudocounted n-mer frequencies (n = 1..6) for the FASTA file
    named by sys.argv[1].

    Each possible n-mer starts at a count of 1; every n-mer returned by
    MotifTools.top_nmers adds its count to itself and to its reverse
    complement.  One header line and one line per n-mer go to stdout for
    each width.
    """
    sequences = Fasta.load(sys.argv[1]).values()
    for width in range(1, 7):
        every_nmer = permute(width)
        observed = MotifTools.top_nmers(width, sequences, 'with counts', 'purge Ns')
        counts = {}
        total = 0
        for word in every_nmer:
            counts[word] = 1  # pseudocount
            total += 1
        for word, seen in observed[:]:
            try:
                partner = MotifTools.revcomplement(word)
                counts[word] += seen
                counts[partner] += seen
                total += 2 * seen
            except KeyError:
                pass
        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1], total))
        for word in sorted(counts.keys()):
            print("%-7s %20.17f" % (word, float(counts[word]) / total))
        sys.stdout.flush()
Esempio n. 4
0
def SGDData():
    """Download the SGD yeast data files and build one genome FASTA file.

    Fetches the feature tables, protein FASTA files and the per-chromosome
    sequence files listed below into ``TAMO.paths.SGDdir`` through
    ``downloadfiles``, then merges the chromosome files into
    ``NCBI_yeast_genome.fsa`` in that same directory.
    """
    import os.path  # local import: the file-level import block is not visible here

    root    = TAMO.paths.SGDdir
    urlroot = 'ftp://genome-ftp.stanford.edu/pub/yeast/data_download/'
    files = ['chromosomal_feature/SGD_features.tab',
             'chromosomal_feature/dbxref.tab',
             'chromosomal_feature/chromosome_length.tab',
             'sequence/GenBank/yeast_nrpep.fasta.gz',
             'sequence/genomic_sequence/orf_protein/orf_trans_all.fasta.gz',
             # (url, local name) pair: fetched from outside urlroot
             ('http://yeastgfp.ucsf.edu/allOrfData.txt', 'Huh_Nature_2003.tab')
             ]

    chrs = '01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt'.split()

    files.extend(['sequence/NCBI_genome_source/chr%s.fsa' % x for x in chrs])

    downloadfiles(root, urlroot, files)

    from TAMO.seq import Fasta

    print("Assembling yeast genome sequence files into a single file (NCBI_yeast_genome.fsa)")
    genome = {}
    for chrom in chrs:
        # os.path.join gives the same path whether or not SGDdir ends in '/'
        chrom_seqs = Fasta.load(os.path.join(root, 'chr%s.fsa' % chrom))
        # Only the first record of each chromosome file is used
        # (presumably each file holds exactly one record).
        seq_id, seq = list(chrom_seqs.items())[0]
        label = chrom
        if label[0] == '0':
            label = label[1]                      # '01' -> '1'
        genome['chr%s  %s' % (label, seq_id)] = seq
    Fasta.write(genome, os.path.join(root, 'NCBI_yeast_genome.fsa'))
Esempio n. 5
0
def loadMiRNAs(miRNA_Path):
    """
    Load the fasta file at miRNA_Path (expected to hold mature miRNAs)
    and return the result of Fasta.load for it.
    """
    mature_miRNAs = Fasta.load(miRNA_Path)
    return mature_miRNAs
Esempio n. 6
0
def main():
    """Report n-mer frequencies with pseudocounts for widths 1 through 6.

    The sequences come from the FASTA file given as sys.argv[1].  Every
    possible n-mer is seeded with a count of 1, then each n-mer found by
    MotifTools.top_nmers contributes its count both to itself and to its
    reverse complement.  Results are printed to stdout, one table per width.
    """
    fasta_seqs = Fasta.load(sys.argv[1]).values()
    for w in range(1, 7):
        candidates = permute(w)
        found = MotifTools.top_nmers(w, fasta_seqs, 'with counts', 'purge Ns')
        tally = {}
        grand_total = 0
        for kmer in candidates:
            tally[kmer] = 1  # Pseudo count
            grand_total += 1
        for kmer, hits in found[:]:
            try:
                revcomp = MotifTools.revcomplement(kmer)
                tally[kmer] += hits
                tally[revcomp] += hits
                grand_total += 2 * hits
            except KeyError:
                pass
        ordered = sorted(tally.keys())
        print("# freq in %s (total %d with pseudocounts)" % (sys.argv[1],
                                                             grand_total))
        for kmer in ordered:
            print("%-7s %20.17f" % (kmer, float(tally[kmer]) / grand_total))
        sys.stdout.flush()
Esempio n. 7
0
def calcStats(fastaPath):
    """Return nucleotide composition statistics for a FASTA file.

    All sequences are concatenated and upper-cased before counting.

    Returns a dict with:
      seqLen    -- number of sequences in the file (not a length)
      totNucs   -- total number of characters over all sequences
      aCnt, cCnt, gCnt, tCnt, nCnt -- per-letter counts
      nonNs     -- aCnt + cCnt + gCnt + tCnt
      n2tot     -- nCnt / totNucs
      n2nonN    -- nCnt / nonNs
      percentGC -- (gCnt + cCnt) / nonNs  (a fraction, not 0-100)

    A ratio whose denominator is zero (empty file, or no A/C/G/T at all) is
    reported as NaN instead of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # undefined ratios are reported as NaN rather than crashing
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    seqFile = Fasta.load(fastaPath)
    # join once instead of growing a string record by record
    combinedSeq = ''.join([seqFile[each] for each in seqFile]).upper()

    seqs       = len(seqFile)
    totNucs    = len(combinedSeq)
    aCnt       = combinedSeq.count('A')
    cCnt       = combinedSeq.count('C')
    gCnt       = combinedSeq.count('G')
    tCnt       = combinedSeq.count('T')
    nCnt       = combinedSeq.count('N')
    nonNs      = aCnt + cCnt + gCnt + tCnt

    return {'seqLen': seqs,
            'totNucs': totNucs,
            'aCnt': aCnt,
            'cCnt': cCnt,
            'gCnt': gCnt,
            'tCnt': tCnt,
            'nCnt': nCnt,
            'nonNs': nonNs,
            'n2tot': _ratio(nCnt, totNucs),
            'n2nonN': _ratio(nCnt, nonNs),
            'percentGC': _ratio(gCnt + cCnt, nonNs)}
Esempio n. 8
0
def orf2pseq(orf):
    """Return the protein sequence recorded for *orf*, or '' if there is none.

    On first use the sequences are loaded from the FASTA file named by
    ``_ORFPSEQS`` into the module-level cache ``_orfpseqs``, and a single
    trailing '*' is removed from each sequence.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf, pseq in list(_orfpseqs.items()):
            # endswith() is safe on an empty sequence; pseq[-1] is not
            if pseq.endswith('*'):
                _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
Esempio n. 9
0
def orf2pseq(orf):
    """Return the protein sequence recorded for *orf*, or '' if there is none.

    On first use the sequences are loaded from the FASTA file named by
    ``_ORFPSEQS`` into the module-level cache ``_orfpseqs``, and a single
    trailing '*' is removed from each sequence.
    """
    global _orfpseqs
    if not _orfpseqs:
        from TAMO.seq import Fasta
        _orfpseqs = Fasta.load(_ORFPSEQS)
        for _orf, pseq in list(_orfpseqs.items()):
            # endswith() is safe on an empty sequence; pseq[-1] is not
            if pseq.endswith('*'):
                _orfpseqs[_orf] = pseq[:-1]
    return _orfpseqs.get(orf, '')
Esempio n. 10
0
def genomebg(infile,outfile):
    """Run the MDscan ``genomebg.linux`` tool on *infile*, writing *outfile*.

    The input FASTA is first rewritten to a temporary file with a very large
    line length (so each sequence ends up on one line), and that file is
    passed to the tool with ``-i``; *outfile* is passed with ``-o``.  The
    tool's combined stdout/stderr is echoed line by line, and "Exited" is
    printed when it returns a non-zero status.

    The temporary file is created with mkstemp (no name race), the command
    is run without a shell (file names are never shell-interpreted), and the
    temporary file is removed even if a step fails.
    """
    import subprocess  # local import: the file-level import block is not visible here

    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD   = Fasta.load(infile)
    fd, tmpfsa = tempfile.mkstemp()
    os.close(fd)  # Fasta.write reopens the file by name
    try:
        Fasta.write(fsaD,tmpfsa,linelen=1000000000)
        try:
            proc = subprocess.Popen([EXE, '-i', tmpfsa, '-o', outfile],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except OSError:
            # A missing/unrunnable binary used to be reported by the shell on
            # the captured stream; keep that failure non-fatal.
            print("Could not run %s" % EXE)
            print("Exited")
            return
        for line in proc.stdout.readlines():
            print(line)
        proc.stdout.close()
        if proc.wait():
            print("Exited")
    finally:
        os.unlink(tmpfsa)
Esempio n. 11
0
def genomebg(infile, outfile):
    """Run the MDscan ``genomebg.linux`` tool on *infile*, writing *outfile*.

    The input FASTA is first rewritten to a temporary file with a very large
    line length (so each sequence ends up on one line), and that file is
    passed to the tool with ``-i``; *outfile* is passed with ``-o``.  The
    tool's combined stdout/stderr is echoed line by line, and "Exited" is
    printed when it returns a non-zero status.

    The temporary file is created with mkstemp (no name race), the command
    is run without a shell (file names are never shell-interpreted), and the
    temporary file is removed even if a step fails.
    """
    import subprocess  # local import: the file-level import block is not visible here

    EXE = MDSCAN_DIR + 'genomebg.linux'
    fsaD = Fasta.load(infile)
    fd, tmpfsa = tempfile.mkstemp()
    os.close(fd)  # Fasta.write reopens the file by name
    try:
        Fasta.write(fsaD, tmpfsa, linelen=1000000000)
        try:
            proc = subprocess.Popen([EXE, '-i', tmpfsa, '-o', outfile],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except OSError:
            # A missing/unrunnable binary used to be reported by the shell on
            # the captured stream; keep that failure non-fatal.
            print("Could not run %s" % EXE)
            print("Exited")
            return
        for line in proc.stdout.readlines():
            print(line)
        proc.stdout.close()
        if proc.wait():
            print("Exited")
    finally:
        os.unlink(tmpfsa)
Esempio n. 12
0
    def __init__(self, fg_file, bg_file, cv_level, markov_file):
        """Set default run parameters and load the sequence sets.

        Loads the Markov background from markov_file, the foreground
        sequences from fg_file and the background probes from bg_file.
        Foreground sequences are cut at their first whitespace; both sets go
        through Fasta.delN.  Background probes that share a name with a
        foreground sequence, are empty, or contain any of S/W/M/K/R/Y are
        dropped.  Finally, foreground sequences are removed at random until
        at most 2000 remain.
        """
        MAX_FG = 2000

        # run parameters
        self.cv_level = cv_level
        self.randomize = 0
        self.beta = 0.0
        self.delta = 0.001
        self.refine = 1
        self.motif_file = 'dummy.out'
        self.dump = 0
        self.family = ''
        self.datafiles = (fg_file, bg_file)

        # Markov background
        print("Loading Markov background file from %s" % markov_file)
        EM.loadMarkovBackground(markov_file)

        # foreground / background sequences
        print("Processing input sequences....")
        self.fg_seqs = Fasta.load(fg_file)
        foreground = self.fg_seqs
        for name in list(foreground.keys()):
            foreground[name] = foreground[name].split()[0]
        self.all_probes = Fasta.load(bg_file)
        Fasta.delN(self.fg_seqs)
        Fasta.delN(self.all_probes)

        # the background must not contain any foreground sequence
        for name in list(self.fg_seqs.keys()):
            if name in self.all_probes:
                del self.all_probes[name]

        # discard empty probes and probes with ambiguity letters
        for name in list(self.all_probes.keys()):
            probe = self.all_probes[name]
            if len(probe) == 0 or re.search('[SWMKRY]', probe):
                del self.all_probes[name]
                print("deleting %s" % name)

        # cap the foreground at MAX_FG by discarding random entries
        while len(self.fg_seqs.keys()) > MAX_FG:
            names = list(self.fg_seqs.keys())
            victim = names[random.randint(0, len(names) - 1)]
            del self.fg_seqs[victim]
Esempio n. 13
0
def loadSeqs(fastaPathList):
    """
    Load every fasta file in fastaPathList into one combined dict; when a
    name occurs in several files the last file wins.  The dict is passed
    through bioDefs.softMaskDict2HardMask (soft masking -> hard masking)
    before being returned.
    """
    combined = {}
    for fasta_path in fastaPathList:
        loaded = Fasta.load(fasta_path)
        combined.update(loaded)

    bioDefs.softMaskDict2HardMask(combined)
    return combined
Esempio n. 14
0
def memefiles2tamo(files, tamoname):
    """Collect motifs from AlignACE/MEME output files and save them as TAMO.

    Each name in *files* ending in ``.ace`` is parsed with AlignAce.AlignAce
    and each name containing ``.meme`` with Meme.Meme; files with any other
    name are skipped with a message.  Missing statistics (pvalue, church,
    ROC_auc, frac) are filled in from the probe set, using the sequences of
    the fasta file named by the global ``fsafile`` or, failing that, located
    from the last parsed file.  All motifs are written to *tamoname* with
    MotifTools.save_motifs.

    Raises:
        ValueError: if none of *files* is a .ace or .meme file.
    """
    global probefile, PROBESET, fsafile

    motifs = []
    mdobject = None      # parser object of the last recognised file
    source_name = None   # name of the last recognised file
    for filename in files:
        print(">>>SDFSD>F  %s" % filename)
        if re.search(r'\.ace$', filename):
            parsed = AlignAce.AlignAce(filename)
            if not parsed.fastafile:
                parsed.fastafile = filename.replace('.ace', '.fsa')
        elif re.search(r'\.meme.*$', filename):
            parsed = Meme.Meme(filename)
            if not parsed.fastafile:
                parsed.fastafile = re.sub(r'\..\.meme', '.meme', filename).replace('.meme', '.fsa')
        else:
            # Previously this either crashed (first file) or silently added
            # the previous file's motifs a second time.
            print("# Skipping %s: not a .ace or .meme file" % filename)
            continue
        mdobject = parsed
        source_name = filename
        motifs.extend(mdobject.motifs)

    if mdobject is None:
        raise ValueError("memefiles2tamo: no .ace or .meme file in %r" % (files,))

    print(mdobject.fastafile)
    if fsafile: fsaname = fsafile
    else:       fsaname = Fasta.find(mdobject.fastafile)
    fsaD    = Fasta.load(fsaname)
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    is_meme = bool(re.search(r'\.meme$', source_name))
    for motif in motifs:
        # A value of 1 (pvalue, church) or None marks a statistic that the
        # parser did not supply.  E_site, E_chi2, E_seq, MNCP and CRA are
        # deliberately not computed here.
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v')
        if motif.ROC_auc is None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.frac is None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if is_meme:
            motif.MAP = -math.log(motif.evalue) / math.log(10)

    # NOTE(review): this orders the last parser object's own list, not the
    # `motifs` list that is saved below -- confirm whether `motifs` was meant.
    if is_meme:
        mdobject.motifs.sort(key=lambda m: m.pvalue)
    else:
        mdobject.motifs.sort(key=lambda m: m.church)

    MotifTools.save_motifs(motifs, tamoname)
Esempio n. 15
0
def get_seq(chr,start=None,stop=None):
    """Return a stretch of yeast chromosome sequence.

    *chr* may be a chromosome number or name (1, 'X', 'chr4') or a full
    region string such as 'chr4:454-465'.  Coordinates are 1-based and
    inclusive.  When no coordinates are given at all, the whole chromosome
    is returned; a missing *start* means "from the beginning" and a missing
    *stop* means "to the end".

    The genome is loaded from ``SGDdir + 'NCBI_yeast_genome.fsa'`` on first
    use and cached in the module-level ``ChrD``.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    if not isinstance(chr, str) or not chr.startswith('chr'):  # 1 -> chr1, 'X' -> chrX
        chr = 'chr%s'%chr
    if start is None and chr.find(':') > 0:                   # chr4:454-465 -> chr4, 454, 465
        chr,_range = chr.split(':')
        start, stop = [int(x) for x in _range.split('-')]
    # The old code sliced with an `end` variable that only existed in the
    # "chr:start-end" case, so explicit start/stop arguments always failed.
    first = 0
    if start is not None:
        first = start - 1
    return ChrD[chr][first:stop]
Esempio n. 16
0
def get_seq(chr, start=None, stop=None):
    """Return a stretch of yeast chromosome sequence.

    *chr* may be a chromosome number or name (1, 'X', 'chr4') or a full
    region string such as 'chr4:454-465'.  Coordinates are 1-based and
    inclusive.  When no coordinates are given at all, the whole chromosome
    is returned; a missing *start* means "from the beginning" and a missing
    *stop* means "to the end".

    The genome is loaded from ``SGDdir + 'NCBI_yeast_genome.fsa'`` on first
    use and cached in the module-level ``ChrD``.
    """
    global ChrD
    if not ChrD:
        from TAMO.seq import Fasta
        ChrD = Fasta.load(SGDdir + 'NCBI_yeast_genome.fsa')
    if not isinstance(chr, str) or not chr.startswith('chr'):
        chr = 'chr%s' % chr  # 1 -> chr1, 'X' -> chrX
    if start is None and chr.find(':') > 0:  # chr4:454-465 -> chr4, 454, 465
        chr, _range = chr.split(':')
        start, stop = [int(x) for x in _range.split('-')]
    # The old code sliced with an `end` variable that only existed in the
    # "chr:start-end" case, so explicit start/stop arguments always failed.
    first = 0
    if start is not None:
        first = start - 1
    return ChrD[chr][first:stop]
Esempio n. 17
0
def LoadDNA(verbose=False):
    """Return the DNA sub-sequence selected by the DNA section of ``params``.

    Reads FILE, CHR, START and END from ``params`` (END falls back to
    START + LENGTH when it is unset or zero), loads the fasta file and
    returns ``sequence[START:END]`` of the named chromosome.

    Returns "" when no FILE is configured or when the chromosome is not in
    the file.  With *verbose* set, progress messages are printed.
    """
    dna = ""
    fastafile = params.GetString(DNA_section, "FILE")
    if not fastafile:
        return dna

    chromo = params.GetString(DNA_section, "CHR")
    chr_start = params.GetInt(DNA_section, "START")
    chr_end = params.GetInt(DNA_section, "END")
    if not chr_end:
        chr_end = params.GetInt(DNA_section, "LENGTH")
        chr_end += chr_start
    if verbose:
        print("Loading fasta: [%s]\n" % fastafile)

    seqs = Fasta.load(fastafile)

    if verbose:
        n = sum([len(seq) for seq in seqs.values()])
        print("Genome length = %d, # chromosomes = %d\n" % (n, len(seqs)))

    if chromo in seqs:
        seq = seqs[chromo]
        if verbose:
            print("Chr[%s] = %d nt\n" % (chromo, len(seq)))
        dna = seq[chr_start:chr_end]
        if verbose:
            print("DNA[%d:%d] = %d nt\n" % (chr_start, chr_end, len(dna)))
    elif verbose:
        # the message used to reference an undefined name `filename`
        print("Cannot find [%s] chromosome in %s\n" % (chromo, fastafile))
    if verbose:
        print("DNA:[%s]\n" % dna)

    return dna
Esempio n. 18
0
def LoadDNA():
    """Return the DNA sub-sequence selected by the DNA section of ``params``.

    Reads FILE, CHR, START and END from ``params`` (END falls back to
    START + LENGTH when it is unset or zero), loads the fasta file and
    returns ``sequence[START:END]`` of the named chromosome, printing
    progress messages along the way.

    Returns "" when no FILE is configured or when the chromosome is not in
    the file.
    """
    dna = ""
    fastafile = params.GetString(DNA_section, "FILE")
    if not fastafile:
        return dna

    chromo = params.GetString(DNA_section, "CHR")
    chr_start = params.GetInt(DNA_section, "START")
    chr_end = params.GetInt(DNA_section, "END")
    if not chr_end:
        chr_end = params.GetInt(DNA_section, "LENGTH")
        chr_end += chr_start

    print("Loading fasta: [%s]\n" % fastafile)

    seqs = Fasta.load(fastafile)

    n = sum([len(seq) for seq in seqs.values()])
    print("Genome length = %d, # chromosomes = %d\n" % (n, len(seqs)))

    if chromo in seqs:
        seq = seqs[chromo]
        print("Chr[%s] = %d nt\n" % (chromo, len(seq)))
        dna = seq[chr_start:chr_end]
        print("DNA[%d:%d] = %d nt\n" % (chr_start, chr_end, len(dna)))
    else:
        # the message used to reference an undefined name `filename`,
        # turning a missing chromosome into a NameError
        print("Cannot find [%s] chromosome in %s\n" % (chromo, fastafile))
    print("DNA:[%s]\n" % dna)

    return dna
Esempio n. 19
0
def swp_find_and_format(swp):
    """Return the sequence whose header mentions *swp*, wrapped at 70 columns.

    Headers are searched in their first 60 characters only.  The sequences
    are loaded from _SWPFASTA on first use and cached in _swp_seqs.
    Returns None when nothing matches, or when several headers match (the
    candidates are then listed on stdout).
    """
    global _swp_seqs
    if not _swp_seqs:
        _swp_seqs = Fasta.load(_SWPFASTA,key_func=lambda x:x)

    matches = [name for name in _swp_seqs.keys() if swp in name[0:60]]
    if not matches:
        return None
    if len(matches) > 1:
        print("# Multiple matches found for %s:" % swp)
        for name in matches:
            print('# %s' % name)
        return None

    seq = _swp_seqs[matches[0]]
    rows = [seq[i:i+70] + '\n' for i in range(0,len(seq),70)]
    return ''.join(rows)
Esempio n. 20
0
def swp_find_and_format(swp):
    """Look up one sequence by *swp* and format it in 70-character lines.

    A header matches when *swp* occurs within its first 60 characters.
    Sequences come from _SWPFASTA, loaded once into the _swp_seqs cache.
    None is returned for no match and for an ambiguous match; in the
    ambiguous case every matching header is printed first.
    """
    global _swp_seqs
    if not _swp_seqs:
        _swp_seqs = Fasta.load(_SWPFASTA, key_func=lambda x: x)

    candidates = []
    for header in _swp_seqs.keys():
        if swp in header[0:60]:
            candidates.append(header)

    if len(candidates) == 0:
        return None
    if len(candidates) > 1:
        print("# Multiple matches found for %s:" % swp)
        for header in candidates:
            print('# %s' % header)
        return None

    residues = _swp_seqs[candidates[0]]
    pieces = []
    for offset in range(0, len(residues), 70):
        pieces.append(residues[offset:offset + 70] + '\n')
    return ''.join(pieces)
Esempio n. 21
0
 def __init__(self,fastaSeqs, motifDict, thresh=0.5,window=200):
     """Build one SeqMap per input sequence and keep them in self.seqMaps.

     fastaSeqs is either a path to a fasta file (str) or a dict of
     sequences; motifDict, thresh and window are handed to every SeqMap.
     The build time and running count are printed per sequence, and an
     assertion stops the build after 250 sequences.
     """
     self.seqMaps = {}

     # Accept a fasta path or an already-loaded sequence dict
     given = type(fastaSeqs)
     assert given == type('string') or given == type({}),\
            'MapLib arg(fastaSeqs) must be string pointing to file or a seqDict.'
     if given == type('string'):
         seqs = Fasta.load(fastaSeqs)
     elif given == type({}):
         seqs = fastaSeqs

     # One SeqMap per sequence, timed individually
     built = 0
     for name in seqs:
         built += 1
         assert built <= 250
         started = time()
         self.seqMaps[name] = SeqMap(name, seqs[name], motifDict, thresh=thresh, window=window)
         finished = time()
         print('%.4f\t%s' % (finished-started,built))
Esempio n. 22
0
def geneList2FastaDict(geneList, sourceFastaPath, hardMasked=True):
    """
    Returns a Dict of requested fasta recs in form SeqName:Sequence.
    Defaults to HardMasked return sequences.

    Only genes that are present in the source fasta with a non-empty
    sequence are returned, so the result may be shorter than geneList.
    The number requested and the number found are printed.
    """

    sourceDict = Fasta.load(sourceFastaPath)

    # make new dict of all genes both in geneList AND sourceDict
    # new dict may be shorter than geneList!!!!!!
    newDict = {}
    for gene in geneList:
        # .get() instead of [] so a gene missing from the source is skipped
        # rather than raising KeyError
        seq = sourceDict.get(gene)
        if seq:
            newDict[gene] = seq

    print("%s genes names given, %s found." % (len(geneList), len(newDict)))

    if hardMasked:
        softMaskDict2HardMask(newDict)

    return newDict
Esempio n. 23
0
def motif_matrix(fsa,motif,outfile,genome='mm9'):
    """Score every sequence of a FASTA file against every motif of a file.

    Each motif's log-probability matrix is adjusted by the zeroth-order
    Markov background (log2 of the background letter frequency is
    subtracted), and the best score of the adjusted motif on each
    upper-cased sequence is scaled to (best - min) / (max - min).  The
    motifs-by-sequences score matrix is written to *outfile* with
    np.savetxt, three decimals per value.

    *genome* selects the background file: 'hg18' uses the human promoter
    background, anything else the mouse background.
    """
    if genome=='hg18': markov="/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else: markov="/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    # Load motifs and the background once; the background file was
    # previously re-read from disk for every motif.
    m=MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    log2 = math.log(2.0)

    F=Fasta.load(fsa,key_func=lambda x:x)
    seqs=[seq.upper() for seq in F.values()]   # upper-case once, not per motif
    SCORES=np.zeros((len(m),len(seqs)),dtype='float')

    for i,M in enumerate(m):
        # Background-adjust the motif's log-probabilities (in place, as before)
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter])/log2
        AM = MotifTools.Motif_from_ll(ll)
        mi,ma=AM.minscore,AM.maxscore

        # Best hit per sequence, rescaled to the motif's score range
        for j,seq in enumerate(seqs):
            SCORES[i,j]=(AM.bestscore(seq)-mi)/(ma-mi)

    np.savetxt(outfile,SCORES,fmt='%1.3f')
Esempio n. 24
0
# Copy the fasta records named in a gene list out of a source fasta file.
originalFastaDict = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Aedes/aedes2KBupStreamTSS.softMasked.geneStrand.fas'

# Read the wanted IDs, making sure the list file is closed again
_geneListFile = open('/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.genes.txt', 'rU')
try:
    desiredFastaList = _geneListFile.readlines()
finally:
    _geneListFile.close()

outFile           = '/Users/biggus/Documents/James/Collaborations/Campbell/data/CCupAt4Days.UNmasked.fas'

hardMask          = None

#==========================================================================

#  Strip newlines from fasta ID list
desiredFastaList = [line.strip() for line in desiredFastaList]


#  Instantiate the fasta rec lists
originalFastaDict = Fasta.load(originalFastaDict)

#  New dict to catch copied seqObjs
desiredFastaDict = {}

for rec in desiredFastaList:
    if rec in originalFastaDict:
        desiredFastaDict[rec] = originalFastaDict[rec]
    else:
        print(rec+' not found in source fasta list!')


# Hard Mask if requested: every soft-masked (lowercase) base becomes 'N'.
# (Previously only lowercase 'a' was replaced.)
if hardMask:
    for x in desiredFastaDict:
        desiredFastaDict[x] = ''.join(['N' if base.islower() else base
                                       for base in desiredFastaDict[x]])
Esempio n. 25
0
def _ensure_conservation_cache(G, probes, cons_pickle):
    """Write the per-probe conservation pickle unless a loadable one exists."""
    try:
        CFH = open(cons_pickle, 'r')
        try:
            # NOTE: pickle.load executes whatever the file contains; only
            # point this at cache files this code wrote itself.
            pickle.load(CFH)
        finally:
            CFH.close()
        return
    except Exception:
        pass  # missing or unreadable cache: rebuild it below

    ConsDict = {}
    for probe in probes:
        try:
            seq_list = G.alignments[probe]
        except Exception:
            continue
        if seq_list != []:
            cer_seq = seq_list[0][1]
        else:
            cer_seq = ''
        cer_seq = cer_seq.upper()
        aligned = []   # up to three aligned sequences, '' when absent
        cons = []      # per aligned sequence: 1 = differs from reference, 0 = same
        for i in range(1, 4):
            try:
                aligned.append(seq_list[i][1].upper())
            except Exception:
                aligned.append('')
            cons.append([])
        for position in range(len(cer_seq)):
            ref = cer_seq[position]
            for i in range(3):
                if aligned[i] == '':
                    continue
                if aligned[i][position] != ref:
                    cons[i].append(1)
                else:
                    cons[i].append(0)
        ConsDict[probe] = cons
    CFH = open(cons_pickle, 'w')
    try:
        pickle.dump(ConsDict, CFH)
    finally:
        CFH.close()


def info2seeds(N, infofile, probefile, species='YEAST'):
    """Build seed motifs from the most enriched n-mers of an input file.

    Candidate sequences come from *infofile*: for a ``.info`` file, the
    binding sites reported by infoana.Infofile, each padded with 'NNNN' on
    both sides; for a ``.fsa`` file, the gap-stripped aligned sequences of
    every probe found in the probe set's alignments (a ``.cpickle``
    conservation cache is written next to the file as a side effect).

    When *N* is non-zero, the top n-mers of width N are taken from those
    sequences (gapped as first-third + N's + second-third when N >= 11) and
    capped at 200.  Each n-mer whose leading N//3 characters are alphabetic
    is scored with the probe set's p_value against the IDs read from
    *probefile*; the 20 n-mers with the smallest p-values are converted to
    ConvergeMotifTools.Motif objects.

    Returns:
        list of ConvergeMotifTools.Motif (at most 20).
    """
    if species == 'human':
        species = 'HUMAN'
    G = ProbeSet(species)
    IDs = G.ids_from_file(probefile)
    Q = EM.theMarkovBackground.zeroth()
    seqs = []

    if re.search(r'\.info$', infofile):
        I = infoana.Infofile(infofile)
        print("# Loading infofile: %s" % infofile)
        print(I)
        seqs = ['NNNN%sNNNN' % x for x in I.bsites2seqs(50.0)]
    elif re.search(r'\.fsa$', infofile):
        fsaDict = Fasta.load(infofile)
        probes = fsaDict.keys()
        # NOTE(review): split('.')[0] cuts at the FIRST dot of the path, so
        # './x.fsa' maps to '.cpickle'; kept as is so existing caches are
        # still found -- confirm whether os.path.splitext was intended.
        cons_pickle = infofile.split('.')[0] + '.cpickle'
        # The cache rebuild used to reuse `seqs` as scratch space, leaving
        # the last probe's aligned sequences in the seed pool.
        _ensure_conservation_cache(G, probes, cons_pickle)

        for probe in probes:
            try:
                seq_list = G.alignments[probe]
            except Exception:
                continue
            for seq in seq_list:
                seqs.append(seq[1].replace('-', '').replace('.', ''))

    if not N:
        nmers = seqs
    else:
        third = N // 3
        if N < 11:
            nmers = ConvergeMotifTools.top_nmers(N, seqs)
        else:
            gap = 'N' * (N - 2 * third)
            nmers = [nmer[0:third] + gap + nmer[third:2 * third]
                     for nmer in ConvergeMotifTools.top_nmers(N, seqs, 0, '', 1)]
        if len(nmers) > 201: nmers = nmers[0:200]

    print("Scoring enrichment of %d nmers from .info file" % len(nmers))

    nmers_scoresT = []
    for nmer in nmers:
        if nmer[0:(N // 3)].isalpha():
            p = G.p_value(nmer, IDs, 'verbose')
            nmers_scoresT.append((nmer, p))
    nmers_scoresT.sort(key=lambda pair: pair[1])   # smallest p-value first

    models = []
    for nmer, _p in nmers_scoresT[:20]:
        m = ConvergeMotifTools.Motif('', Q)
        m.compute_from_text(nmer, 0.1)
        models.append(m)
    return (models)
Esempio n. 26
0
 def freq_from_fasta(self,fastafile):
    """Load the sequences of *fastafile* and pass them to freq_from_seqs.

    The file named by the argument is used; previously the argument was
    ignored and sys.argv[1] was read instead.
    """
    seqsD = Fasta.load(fastafile)
    seqs  = seqsD.values()
    self.freq_from_seqs(seqs)
Esempio n. 27
0
def main():
    """Scan the probes of a FASTA file for motif matches and draw a text map.

    Command-line options:
      -f FILE   fasta file of probes (required)
      -m TEXT   motifs, parsed with MotifTools.txt2motifs (may be repeated)
      -n LIST   comma-separated indices of the motifs to use (default: all)
      -L CHARS  one label character per selected motif
      -t FRAC   score threshold as a fraction of each motif's maximum (0.7)
      -a LIST   comma-separated motifs given as text, added to the set
      -S SCALE  map columns per base (default 50/1000)

    Probes containing any of B, D, H or U are skipped.  For each probe with
    at least one hit, a scaled one-line map and the list of hits are
    printed, followed by a summary count.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)

    print("#" + ' '.join(sys.argv))
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.7)
    ambigs = []

    scale   = 50.0 / 1000.0

    motifs = []
    for opt, value in opts:
        if   opt == '-f':  fastafile = value
        elif opt == '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt == '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt == '-L':  labels    = list(value)
        elif opt == '-t':  thresh    = float(value)
        elif opt == '-a':  ambigs.extend(value.split(','))
        elif opt == '-S':  scale     = float(value)

    if not fastafile:
        # without -f there is nothing to scan
        usage()
        sys.exit(1)
    probes = Fasta.load(fastafile)

    # NOTE(review): no option ever sets motiffile, so this branch never runs.
    if motiffile:
        motifs.extend(TAMO.tamofile2motifs(motiffile))
    for ambig in ambigs:
        motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))

    def label_for(i):
        # label of the i-th selected motif: user label if given, else index
        if labels and i < len(labels):
            return labels[i]
        return '%d'%i

    print('# %d: %s'%(len(motifs),motifnums))
    for i, num in enumerate(motifnums):
        motif = motifs[num]
        print('%-3s : %s %5.2f (%4.2f)'%(label_for(i),motif,thresh*motif.maxscore,thresh))

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            candidate = motifs[num]
            # The threshold is relative to THIS motif's maximum score; it
            # used to reuse the last motif from the listing loop above.
            result = candidate.scan(probes[key],thresh*candidate.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )    # room for hits near the right edge
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    ID         = label_for(i)
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    # put the label in the first free column at or after the hit
                    for offset in range(10):
                        if a[start+offset] == '-':
                            a[start+offset] = ID
                            break
        print('%-14s %s %s %s'%(key,''.join(a),
                                ' '*max(0,maxw-len(a)),
                                '| '.join(['%-27s'%x for x in desc])))

    print('')
    print("Found matches in %d of %d input probes"%(len(probehits),len(probes)))
Esempio n. 28
0
def main():
    if len(sys.argv) < 3:
        print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0]))
        print "Options include:"
        print "                  -valid  <tf_name> Check answers against Transfac"
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
        print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -info <file.info> for structural priors"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
        print "                  -B                Use Bacterial Orfs"
        print " " 
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print 
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print 
        print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
        print 
        sys.exit(1)
    fastafile = sys.argv[1] 

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else: width = None
    
    algorithm = ''
    beta      = ''
    seedbeta  = ''
    cbeta     = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = []
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0  #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    padlen    = 0
    thetas    = []
    seed_count = 0   #Default: Take the top 0
    seed_s     = []  #Initialize seq array
    sp_seed   = 0
    
    '''Parse command-line arguments'''
    for tok,i in zip(sys.argv,xrange(len(sys.argv))):
        if   tok == '-top'   :   seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':   algorithm  = "GREEDY"
        elif tok == '-prior' :   seed_s.append(sys.argv[i+1])
        elif tok == 'sp'     :   sp_seed    = 1
        elif tok == '-beta'  :   beta       = float(sys.argv[i+1])
        elif tok == '-beta'  :   seedbeta   = float(sys.argv[i+1])
        elif tok == '-cbeta' :   cbeta      = float(sys.argv[i+1])
	elif tok == '-thetas':   
		for j in range(int(sys.argv[i+1])):
			thetas.append(float(sys.argv[i+j+2]))
        elif tok == '-gamma' :   gamma      = float(sys.argv[i+1])
        elif tok == '-delta' :   deltamin   = float(sys.argv[i+1])
        elif tok == '-info'  :   infofile   = sys.argv[i+1]
        elif tok == '-valid' :   valid_tfs.append(sys.argv[i+1])
        elif tok == '-w'     :   width      = sys.argv[i+1]
        elif tok == '-width' :   width      = sys.argv[i+1]
        elif tok == '-gap'   :   gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax'  :   pmax       = 1
        elif tok == '-Y2K'   :   species    = "YEAST_2000_UP"
        elif tok == '-Y5C'   :   species    = "YEAST_500_UP"
        elif tok == '-B'     :   species    = "BAC_ORF"
        elif tok == '-Ch22'  :   species    = "Ch22"
        elif tok == '-genome':   species    = sys.argv[i+1]
        elif tok == '-pad'   :
            pad        = sys.argv[i+1]
            padlen     = sys.argv[i+2]     
        elif tok == '-TF'    :
            TFSEEDS = 1
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human' :
            _s = ''
            if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1]
            else:                       _s = ''
            species    = 'HUMAN'+_s

    seqs = []
    fsaD     = Fasta.load(fastafile)
    probes = fsaD.keys()
    '''
    for probeid in fsaD.keys():
        seqs.append  (fsaD    [probeid])
    '''
    numprobes = len(probes)
    #print "numprobes: %i"%numprobes
    if not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(fastafile,numprobes,species)

    #seqs     = EM.fasta2seqs(fastafile)
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count,len(seqs))])
    #not necessary --- seed_c.extend(c_seqs[0:min(seed_count,len(seqs))])

    if infofile and width=='info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width,infofile,fastafile,species))
        else:
            print 'Error: need to specify motif width w/ .info file'
    
    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS:
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(lambda x,y: cmp(x.seednum,y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x:'N'+x+'N', gapped_priors)
        seed_s.extend(gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        for m in seedmodels:
            newmodels.append(m[-2,m.width+2])
        seedmodels = newmodels

    '''
    Set everything up and GO!!
    '''
    global theEM
    theEM = EM.EM(seed_s,[],[],width,"VERBOSE")
    if beta:     theEM.beta     = beta
    if cbeta:    theEM.cbeta    = cbeta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    if thetas:   theEM.thetas = thetas
    theEM.param['gamma']        = gamma
    theEM.probeids.extend(probes)
    theEM.seqs.extend(all_seqs)
    #theEM.cons_seqs.extend(c_seqs)
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()    #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))

    #sys.exit(0)
    
    '''
    Compute some metrics
    '''
    print "#Loading Genome %s"%species ; sys.stdout.flush()
    if species == 'human':
	Genome = ProbeSet('HUMAN',enrichfact)
    else:
    	Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)

    #fsaDict = Fasta.load(fastafile)
    #probes = fsaDict.keys()
    #cons_pickle = fastafile.split('.')[0] + '.cpickle'
    for C in theEM.candidates:
        #p_cons = conservation_pvalue(C.pssm,probes,fsaDict,ConsDict,4)
        #print p_cons
        if not pmax:
            w_dict = Genome.w_dict
            for key,i in zip(w_dict.keys(),range(len(C.pssm.thetas))):
                w_dict[key] = C.pssm.thetas[i]
            Genome.w_dict = w_dict
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            #print "P-VAL: %f"%(Genome.p_value(C.pssm,ids,'verbose')*p_cons)
            C.pssm.church = Genome.church(C.pssm,ids)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:",p,frac

    for valid_tf in valid_tfs:
        C.pssm.valid = Validate.validate(C.pssm,valid_tf,'Verbose',"Want Tuple")
    
    '''
    Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    '''

    theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue))
    for C,i in zip(theEM.candidates,range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
        else:
            vstring = ''
        C.pssm._maxscore()     #Recomputed
        print "Log-odds matrix for Motif %3d %s"%(i,C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'):  m.gamma = None #Kludge to deal w/ old shelves
        if m.seedtxt:     print "Seed: %3d %s"%(i,m.seedtxt)
        if m.source:      print "Source: ",m.source
        if m.gamma:       print "Gamma: %7.5f"%m.gamma
        if m.threshold:   print "Threshold: %5.2f"%m.threshold
        if m.thetas != []:
            tstr = "thetas:"
            for theta in m.thetas:
                tstr = tstr + " " + str(theta)
            print tstr                                    
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None: vstring = 'ch: %5.2f  %s'%(
            math.fabs(math.log(C.pssm.church)/math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%"%(
                C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore)
            

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq,i,C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()      #Recomputed
        print '*'*len(seq)
        print "MAP Score: %f"%C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0) #Avoid ridiculous python cleanup times
Esempio n. 29
0
from TAMO.seq import Fasta

fastaFile = '/Users/biggus/Documents/James/AedesPeptides/Aedes_aegypti.AaegL1.50.pep.all.fa'
outFile   = '/Users/biggus/Documents/James/AedesPeptides/Aedes_aegypti.AaegL1.50.pep.all.reformatted.fa'

fDict = Fasta.load(fastaFile, lambda x: x)

newDict = {}

for n,s in fDict.items():
    n = [n[:13]+'_Ens',n[13:]]
    n = '|'.join(n)
    n = '|'+n
    
    newDict[n] = s
    
outFile = open(outFile, 'w')
newDict_keys = newDict.keys()
newDict_keys.sort()

for key in newDict_keys:
    entry = '>%s\n%s\n' % (key,newDict[key])
    outFile.write(entry)
    
print 'Done.'    
Esempio n. 30
0
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Write a (motifs x sequences) matrix of normalized best motif scores.

    fsa     -- FASTA file of sequences to score
    motif   -- motif file readable by MotifTools.load
    outfile -- destination for np.savetxt (one row per motif, '%1.3f')
    genome  -- 'hg18' selects the human promoter Markov file; any other
               value selects the mouse one

    For each motif, log2 of the zeroth-order background frequency is
    subtracted from every entry of its logP, a new motif is built from the
    adjusted matrix, and each upper-cased sequence gets
    (bestscore - minscore) / (maxscore - minscore).
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    #Load motif and background adjust PSSM
    m = MotifTools.load(motif)
    # The background file is the same for every motif, so it is loaded once
    # here instead of once per motif inside the loop.
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    log_of_2 = math.log(2.0)

    F = Fasta.load(fsa, key_func=lambda x: x)
    seqs = F.values()
    SCORES = np.zeros((len(m), len(seqs)), dtype='float')

    for i, M in enumerate(m):
        # ll is M.logP itself, so the adjustment below is presumably applied
        # to the loaded motif in place -- confirm if M is reused elsewhere.
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter]) / log_of_2
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        # Best score of the adjusted motif on each sequence, rescaled by the
        # motif's own score range.
        for j, seq in enumerate(seqs):
            max_score = AM.bestscore(seq.upper())
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
Esempio n. 31
0
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0]))
        print "Options include:"
        print ""
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
#       print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -kmerseeds        Use kmers with best enrichment score as seeds for EM"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -g genome.fsa     Use specicied Fasta file as background (searches first for matching frequency file)"
#       print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
#       print "                  -B                Use Bacterial Orfs"
        print " " 
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print 
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print 
        print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
        print 
        sys.exit(1)
    fastafile = sys.argv[1]

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else: width = None
    
    algorithm = ''
    beta      = ''
    seedbeta  = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = [] #NOT USED
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0  #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    bgfile    = None

    seed_count = 0   #Default: Take the top 0
    seed_s     = []  #Initialize seq array

    '''Parse command-line arguments'''
    for tok,i in zip(sys.argv,xrange(len(sys.argv))):
        if   tok == '-top'   :   seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':   algorithm  = "GREEDY"
        elif tok == '-prior' :   seed_s.append(sys.argv[i+1])
        elif tok == '-beta'  :   beta       = float(sys.argv[i+1])
        elif tok == '-seedbeta': seedbeta   = float(sys.argv[i+1])
        elif tok == '-gamma' :   gamma      = float(sys.argv[i+1])
        elif tok == '-delta' :   deltamin   = float(sys.argv[i+1])
        elif tok == '-kmerseeds' :   infofile   = 1
        elif tok == '-valid' :   valid_tfs.append(sys.argv[i+1]) #NOT USED
        elif tok == '-w'     :   width      = sys.argv[i+1]
        elif tok == '-width' :   width      = sys.argv[i+1]
        elif tok == '-gap'   :   gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax'  :   pmax       = 1
        elif tok == '-Y2K'   :   species    = "YEAST_2000_UP"
        elif tok == '-Y5C'   :   species    = "YEAST_500_UP"
        elif tok == '-B'     :   species    = "BAC_ORF"
        elif tok == '-Ch22'  :   species    = "Ch22"
        elif tok == '-genome':   species    = sys.argv[i+1]
        elif tok == '-pad'   :   pad        = "TRUE"
        elif tok == '-bgfile':   bgfile     = sys.argv[i+1]
        elif tok == '-TF'    :  #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION)
            TFSEEDS = 1
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human' :
            _s = ''
            if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1]
            else:                       _s = ''
            species    = 'HUMAN'+_s

    if infofile: infofile = fastafile

    if bgfile:
        EM.loadMarkovBackground(bgfile)
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(species)
    else:
        EM.theMarkovBackground = EM.Zeroth()

    fsaD     = Fasta.load(fastafile)
    Fasta.delN(fsaD)
    seqs     = fsaD.values()
    probes   = fsaD.keys()
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count,len(seqs))])

    if infofile and width=='info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width,infofile,fastafile,species))
        else:
            print 'Error: need to specify motif width w/ .info file'
    
    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS: #NOT USED
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(lambda x,y: cmp(x.seednum,y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x:'N'+x+'N', gapped_priors)
        seed_s.extend(gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        left  = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        for m in seedmodels:
            newmodels.append(left + m + right)
            print left + m + right
        seedmodels = newmodels

    '''
    Set everything up and GO!!
    '''
    global theEM
    theEM = EM.EM(seed_s,[],width,"VERBOSE")
    if beta:     theEM.beta     = beta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    theEM.param['gamma']        = gamma
    theEM.seqs.extend(all_seqs)
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()    #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))


    '''
    Compute some metrics
    '''
    print "#Loading Genome %s"%species ; sys.stdout.flush()
    Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)
    
    for C in theEM.candidates:
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            C.pssm.church = Genome.church(C.pssm,ids)
            C.pssm.frac   = Genome.frac(C.pssm,probes,None,0.7)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:",p,frac

        matching             = Genome.matching_ids(C.pssm,[],factor=0.7)
        matchbound           = [x for x in matching if x in probes]
        C.pssm.numbound      = len(probes)
        C.pssm.nummotif      = len(matching)
        C.pssm.numboundmotif = len(matchbound)
        sys.stdout.flush()

    
    '''
    Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    '''

    theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue))
    for C,i in zip(theEM.candidates,range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:  #NOT USED
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
        else:
            vstring = ''
        C.pssm._maxscore()     #Recomputed

        MotifTools.print_motif(C.pssm,20,i)
        sys.stdout.flush()
        continue
    
        #Antiquated stuff  -- Remove !!
        print "Log-odds matrix for Motif %3d %s"%(i,C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'):  m.gamma = None #Kludge to deal w/ old shelves
        if m.seedtxt:     print "Seed: %3d %s"%(i,m.seedtxt)
        if m.source:      print "Source: ",m.source
        if m.gamma:       print "Gamma: %7.5f"%m.gamma
        if m.threshold:   print "Threshold: %5.2f"%m.threshold
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None: vstring = 'ch: %5.2f  %s'%(
            math.fabs(math.log(C.pssm.church)/math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%"%(
                C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore)
            

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq,i,C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()      #Recomputed
        print '*'*len(seq)
        print "MAP Score: %f"%C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0) #Avoid ridiculous python cleanup times
Esempio n. 32
0
def memefiles2tamo(files, tamoname):
    global probefile, PROBESET

    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme',
                                            filename).replace('.meme', '.fsa')
        motifs.extend(mdobject.motifs)

    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq  == None: motif.E_seq  = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,
                                                   probes,
                                                   'v',
                                                   tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except:
                pass

    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))

    MotifTools.save_motifs(motifs, tamoname)
Esempio n. 33
0
    for header, seq in fg.items() :
        num_peak_bases += len(seq)


if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    if len(args) < 3 :
        parser.error('Must provide three non-option arguments')

    sample_type, organism, fg_fn = args[:3]

    settings_dict = get_org_settings(organism)

    fg = Fasta.load(fg_fn)
    bg = rejection_sampling(fg,settings_dict)


###############################################################
# start Chris' code from rej_samp_bg_rand2.py
    the_genes={} #list of distances to nearest TSS

    # for each peak find the chromosome, distance to nearest
    # gene, size of peaks in bases, and GC content
    the_chrs,dists,sizes,gcs=[],[],[],[]

    # number of bases in the fg sequences
    size=0

    for key in pos_seqs.keys():
Esempio n. 34
0
from TAMO.seq import Fasta


def groupMiRsBym2m8(miRNAs):
    """
    Group miRNA names by the slice [1:8] of their sequence (bases 2 to 8).

    miRNAs = dict(k='miRname', v='miRseq')
    Returns seedDict = dict(k='m2m8Seq', v=[miRnames])

    Also prints one line per seed listing the names that share it,
    separated by ', '.  Sequences shorter than 8 characters are grouped by
    whatever the slice yields.
    """
    seedDict = {}

    for m in miRNAs:
        # setdefault creates the list the first time a seed is seen.
        seedDict.setdefault(miRNAs[m][1:8], []).append(m)

    for each in seedDict:
        # Parenthesized single argument: same output under the Python 2
        # print statement and the Python 3 print function.
        print('%s' % (', '.join(seedDict[each])))

    return seedDict


if __name__ == '__main__':
    # Load the mature miRNA FASTA file and report the names sharing a seed.
    matureFasta = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.fa'
    miRNAs = Fasta.load(matureFasta)
    sD = groupMiRsBym2m8(miRNAs)
    
Esempio n. 35
0
from TAMO.seq import Fasta

fastaFile = "/Users/biggus/Documents/James/AedesPeptides/aaegypti.PEPTIDES-AaegL1.1.reformatted.fa"

fDict = Fasta.load(fastaFile, lambda x: x.split("|")[1])

nrNames = []
rNames = []
for each in fDict.keys():
    if each not in nrNames:
        nrNames.append(each)
    else:
        if each not in rNames:
            rNames.append(each)

print "%s names were repeated at least once." % (len(rNames))

x = 1
Esempio n. 36
0
def info2seeds(N,infofile,probefile,species='YEAST'):
    if species == 'human':
	species = 'HUMAN'
    G    = ProbeSet(species)
    IDs  = G.ids_from_file(probefile)
    Q    = EM.theMarkovBackground.zeroth()
    seqs = []
    
    if re.search('.info$',infofile):
        #I    = infoana.Infofile(infofile,'DONT REMOVE QUERY')
        I    = infoana.Infofile(infofile)
        print "# Loading infofile: %s"%infofile
        print I
        seqs = map(lambda x: 'NNNN%sNNNN'%x, I.bsites2seqs(50.0))
    elif re.search('.fsa$',infofile):
        fsaDict = Fasta.load(infofile)
        probes = fsaDict.keys()
        #sequence_repository = KenzieSequences()
        cons_pickle = infofile.split('.')[0] + '.cpickle'
        try:
            CFH = open(cons_pickle, 'r')
            ConsDict = pickle.load(CFH)
            CFH.close()
        except:
            ConsDict = {}
            for probe in probes:
                seqs = []
                cons = []
                try:
                    seq_list = G.alignments[probe]
                except:
                    continue
                if (seq_list!=[]):
                    cer_seq = seq_list[0][1]
                else:
                    cer_seq = ''
                cer_seq = cer_seq.upper()
                numg = len(seq_list) - 1
                for i in range(1,4):
                    try:
                        seqs.append(seq_list[i][1].upper())
                    except:
                        seqs.append('')
                    cons.append([])
                for position in range(len(cer_seq)):
                    ref = cer_seq[position]
                    for i in range(3):
                        if (seqs[i]==''): continue 
                        if (seqs[i][position]!=ref):
                            cons[i].append(1)
                        else:
                            cons[i].append(0)
                ConsDict[probe] = cons
            CFH = open(cons_pickle, 'w')
            pickle.dump(ConsDict, CFH)
            CFH.close()
       
        for probe in probes:
            superseq = ''
            try:
                seq_list = G.alignments[probe]
            except:
                continue
            for seq in seq_list:
                subseq = seq[1].replace('-','')
                subseq = subseq.replace('.','')
                seqs.append(subseq)
    
    if not N:
        nmers = seqs
    else:
        if (N<11):
            nmers= ConvergeMotifTools.top_nmers(N,seqs)
        else:
            gaplen = N - 2*(N/3)
            gr = ''
            for i in range(gaplen):
                gr = gr + 'N'
            nmers = ConvergeMotifTools.top_nmers(N,seqs,0,'',1)
            gnmers = []
            for nmer in nmers:
                gnmers.append(nmer[0:(N/3)]+gr+nmer[(N/3):2*(N/3)])
            nmers = gnmers
        if len(nmers) > 201: nmers = nmers[0:200]
        
    print "Scoring enrichment of %d nmers from .info file"%len(nmers)

    nmers_scoresT = []
    for nmer in nmers:
        if nmer[0:(N/3)].isalpha():
            p = G.p_value(nmer,IDs,'verbose')
            #if (species=='Ciona'): ng = 2
            #else: ng = 4
            #p_cons = conservation_pvalue(nmer,IDs,fsaDict,ConsDict,ng)
            #if (p_cons<0.1):
            nmers_scoresT.append((nmer,p))
    nmers_scoresT.sort(lambda x,y: cmp(x[1],y[1]))
    #for tup in nmers_scoresT:
    #    print tup
    last = min(20,len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = ConvergeMotifTools.Motif('',Q)
        m.compute_from_text(seq,0.1)
        models.append(m)
    return(models)
Esempio n. 37
0
from gusPyCode.MDAP_proj.MDAP_defs import shuffleSeqDict
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import softMaskDict2HardMask
from time import time
from gusPyCode.defs.mosqData import promoterSeqPaths

# User Variables:
inFile   = promoterSeqPaths.Aa_2000bpUp_softMasked
outFile  = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Aedes/aedes2KBupStreamTSS.UnMasked.geneStrand.shuffledSeqs.1.fas'
hardMask = None


d = Fasta.load(inFile)
#d = {1:'AACTGCANACTGACNNNACTGATGNNN'}

if not hardMask:
    for x in d:
        d[x] = d[x].upper()

t1 = time()
sD = shuffleSeqDict(d)
t2 = time()

Fasta.write(sD,outFile)

print 'Shuffling took %.2f min.' % ((float(t2)-t1)/60)
Esempio n. 38
0
# Output paths; they are not used in the visible part of this script
# (presumably they receive the timing results of the two scanners -- confirm below).
tOut = "/Users/biggus/Documents/James/Data/ReClustering/Python_CRM/tamoTimeIt.6memeMotifs.35seqs.30runs.txt"
mOut = "/Users/biggus/Documents/James/Data/ReClustering/Python_CRM/motilityTimeIt.6memeMotifs.35seqs.30runs.txt"

genes = 35  # how many gene names from the target list are kept
runs = 30   # not used in the visible part (presumably the number of timing repetitions)

# Motif files read with loadTMOs() below.
tmoFiles = [
    "/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/MemeResults/Clus2_247gene_0.8_Apr16_14-46-36.meme.txt.tmo",
    "/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/MemeResults/Clus2_247gene_0.8_Apr16_14-46-33.meme.txt.tmo",
]

fastaPath = (
    "/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas"
)
seqs = Fasta.load(fastaPath)

targetGenes = "/Users/biggus/Documents/James/Data/ReClustering/kmedsPear33Clus50x_2/Clus2_247genes.genes.txt"
# Read one gene name per line.  The handle is closed explicitly instead of
# being left to the garbage collector.
geneListFile = open(targetGenes, "rU")
try:
    targetGenes = [line.strip() for line in geneListFile]
finally:
    geneListFile.close()
# Keep the first `genes` names and replace each one with its sequence
# (raises KeyError if a name is missing from the FASTA file).
targetGenes = [seqs[name] for name in targetGenes[:genes]]


# Collect the motifs from every file into one flat list.
motifs = []
tMotifs = []
mMotifs = []
for t in tmoFiles:
    Ms = loadTMOs(t)
    motifs.extend(Ms)
for i in range(len(motifs)):
Esempio n. 39
0
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="])  # AD added 'i'
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    if not opts:
        usage()
        sys.exit(1)
        

    print "#" + ' '.join(sys.argv)
    fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7
    ambigs = []

    scale   = 50.0 / 1000.0
    
    motifs = []
    for opt, value in opts:
        #print opt, value
        if   opt ==  '-f':  fastafile = value
        elif opt ==  '-m':  motifs.extend(MotifTools.txt2motifs(value))
        elif opt ==  '-n':  motifnums = [int(x) for x in value.split(',')]
        elif opt ==  '-L':  labels    = list(value)
        elif opt ==  '-t':  thresh    = float(value)
        elif opt ==  '-a':  ambigs.extend(value.split(','))
        elif opt ==  '-S':  scale     = float(value)
        elif opt ==  '-i':  motiffile = value  # AD added this option to ACTUALLY supply the tamo motif file at the command-line.  The code to deal with motiffiles already existed. There was just no code for User to supply one.
        
    probes = Fasta.load(fastafile)
    
    if motiffile:
        for f in motiffile.split(','):      # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs
            motifs.extend(MotifTools.load(f))
    if ambigs:
        for ambig in ambigs:
            motifs.append( MotifTools.Motif_from_text(ambig,0.1) )
    if not motifnums:  motifnums = range(len(motifs))
    print '# %d: %s'%(len(motifs),motifnums)
    for i in range(len(motifnums)):
        motif = motifs[motifnums[i]]
        if labels and i < len(labels):
            txt = labels[i]
        else:
            txt = '%d'%i
        print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh)

    probehits = {}
    for key in probes.keys():
        hits_by_motif = []
        save_flag     = 0
        if re.search('[BDHU]',probes[key]): continue
        for num in motifnums:
            result = motifs[num].scan(probes[key],thresh*motif.maxscore)
            if result[0]:
                hits_by_motif.append(result)
                save_flag = 1
            else:
                hits_by_motif.append(None)
        if save_flag:
            probehits[key]=hits_by_motif

    #scale   = .1
    maxw = 40
    for key in probehits.keys():
        l       = len(probes[key])
        a       = list('-'* int(scale*l) )
        a.extend( list(' '*10 ) )
        desc    = []
        matches = probehits[key]
        for i in range(len(matches)):
            if matches[i]:
                subseqs,endpoints,scores = matches[i]
                for idx in range(len(subseqs)):
                    start,stop = endpoints[idx]
                    subseq     = subseqs[idx]
                    score      = scores[idx]
                    if labels and (i<len(labels)): ID = labels[i]
                    else                         : ID = '%d'%i
                    desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score))
                    start = int(start*scale)
                    for offset in range(10):
                        if a[start+offset] == '-':
                            if labels and (i < len(labels)):
                                a[start+offset] = labels[i]
                            else:
                                a[start+offset] = '%d'%i
                            break
        print '%-14s %s'%(key,''.join(a)),
        print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc])
        
    print
    print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
Esempio n. 40
0
def main():
    if len(sys.argv) < 3:
        print "Usage: %s <fasta_file> [width = None ] [options]" % (re.sub(
            '^.*/', '', sys.argv[0]))
        print "Options include:"
        print "                  -valid  <tf_name> Check answers against Transfac"
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
        print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -info <file.info> for structural priors"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
        print "                  -B                Use Bacterial Orfs"
        print " "
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC " % (
            sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 " % (
            sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print
        print " %s t.fsa -prior MCM1_5.tamo:0 " % (sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
        print
        sys.exit(1)
    fastafile = sys.argv[1]

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ', '\ ', x), sys.argv))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else:
        width = None

    algorithm = ''
    beta = ''
    seedbeta = ''
    cbeta = ''
    deltamin = ''
    gamma = 0.2
    infofile = ''
    seedmodels = []
    species = 'YEAST'
    valid_tfs = []
    gapped_syl = None
    gapflank = 0
    gapweight = 0.2
    enrichfact = 0.7
    pmax = 0  #False
    TFSEEDS = 0
    TFMids = []
    pad = None
    padlen = 0
    thetas = []
    seed_count = 0  #Default: Take the top 0
    seed_s = []  #Initialize seq array
    sp_seed = 0
    '''Parse command-line arguments'''
    for tok, i in zip(sys.argv, xrange(len(sys.argv))):
        if tok == '-top': seed_count = int(sys.argv[i + 1])
        elif tok == '-greedy': algorithm = "GREEDY"
        elif tok == '-prior': seed_s.append(sys.argv[i + 1])
        elif tok == 'sp': sp_seed = 1
        elif tok == '-beta': beta = float(sys.argv[i + 1])
        elif tok == '-beta': seedbeta = float(sys.argv[i + 1])
        elif tok == '-cbeta': cbeta = float(sys.argv[i + 1])
        elif tok == '-thetas':
            for j in range(int(sys.argv[i + 1])):
                thetas.append(float(sys.argv[i + j + 2]))
        elif tok == '-gamma':
            gamma = float(sys.argv[i + 1])
        elif tok == '-delta':
            deltamin = float(sys.argv[i + 1])
        elif tok == '-info':
            infofile = sys.argv[i + 1]
        elif tok == '-valid':
            valid_tfs.append(sys.argv[i + 1])
        elif tok == '-w':
            width = sys.argv[i + 1]
        elif tok == '-width':
            width = sys.argv[i + 1]
        elif tok == '-gap':
            gapped_syl = sys.argv[i + 1]
        elif tok == '-gapflank':
            gapflank = int(sys.argv[i + 1])
        elif tok == '-gapweight':
            gapweight = float(sys.argv[i + 1])
        elif tok == '-enrichfact':
            enrichfact = float(sys.argv[i + 1])
        elif tok == '-pmax':
            pmax = 1
        elif tok == '-Y2K':
            species = "YEAST_2000_UP"
        elif tok == '-Y5C':
            species = "YEAST_500_UP"
        elif tok == '-B':
            species = "BAC_ORF"
        elif tok == '-Ch22':
            species = "Ch22"
        elif tok == '-genome':
            species = sys.argv[i + 1]
        elif tok == '-pad':
            pad = sys.argv[i + 1]
            padlen = sys.argv[i + 2]
        elif tok == '-TF':
            TFSEEDS = 1
            for j in range(i + 1, len(sys.argv)):
                if re.match('M0', sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human':
            _s = ''
            if sys.argv[i + 1].isdigit(): _s = '_' + sys.argv[i + 1]
            else: _s = ''
            species = 'HUMAN' + _s

    seqs = []
    fsaD = Fasta.load(fastafile)
    probes = fsaD.keys()
    '''
    for probeid in fsaD.keys():
        seqs.append  (fsaD    [probeid])
    '''
    numprobes = len(probes)
    #print "numprobes: %i"%numprobes
    if not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(fastafile, numprobes, species)

    #seqs     = EM.fasta2seqs(fastafile)
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count, len(seqs))])
    #not necessary --- seed_c.extend(c_seqs[0:min(seed_count,len(seqs))])

    if infofile and width == 'info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width, infofile, fastafile, species))
        else:
            print 'Error: need to specify motif width w/ .info file'

    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS:
        tf = []
        D = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*', '', key.split()[0]))
            m.seedtxt = '%-24s %s' % (m, key)
            tf.append(m)
        tf.sort(lambda x, y: cmp(x.seednum, y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x: 'N' + x + 'N', gapped_priors)
        seed_s.extend(gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        for m in seedmodels:
            newmodels.append(m[-2, m.width + 2])
        seedmodels = newmodels
    '''
    Set everything up and GO!!
    '''
    global theEM
    theEM = EM.EM(seed_s, [], [], width, "VERBOSE")
    if beta: theEM.beta = beta
    if cbeta: theEM.cbeta = cbeta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    if thetas: theEM.thetas = thetas
    theEM.param['gamma'] = gamma
    theEM.probeids.extend(probes)
    theEM.seqs.extend(all_seqs)
    #theEM.cons_seqs.extend(c_seqs)
    theEM.models = seedmodels
    theEM.gapflank = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()  #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))

    #sys.exit(0)
    '''
    Compute some metrics
    '''
    print "#Loading Genome %s" % species
    sys.stdout.flush()
    if species == 'human':
        Genome = ProbeSet('HUMAN', enrichfact)
    else:
        Genome = ProbeSet(species, enrichfact)
    ids = Genome.ids_from_file(fastafile)

    #fsaDict = Fasta.load(fastafile)
    #probes = fsaDict.keys()
    #cons_pickle = fastafile.split('.')[0] + '.cpickle'
    for C in theEM.candidates:
        #p_cons = conservation_pvalue(C.pssm,probes,fsaDict,ConsDict,4)
        #print p_cons
        if not pmax:
            w_dict = Genome.w_dict
            for key, i in zip(w_dict.keys(), range(len(C.pssm.thetas))):
                w_dict[key] = C.pssm.thetas[i]
            Genome.w_dict = w_dict
            C.pssm.pvalue = Genome.p_value(C.pssm, ids, 'verbose')
            #print "P-VAL: %f"%(Genome.p_value(C.pssm,ids,'verbose')*p_cons)
            C.pssm.church = Genome.church(C.pssm, ids)
        else:
            (p, frac) = Genome.best_p_value(C.pssm, ids)
            C.pssm.pvalue = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:", p, frac

    for valid_tf in valid_tfs:
        C.pssm.valid = Validate.validate(C.pssm, valid_tf, 'Verbose',
                                         "Want Tuple")
    '''
    Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    '''

    theEM.candidates.sort(lambda x, y: cmp(x.pssm.pvalue, y.pssm.pvalue))
    for C, i in zip(theEM.candidates, range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)" % (_t[1], _t[2], _t[3])
        else:
            vstring = ''
        C.pssm._maxscore()  #Recomputed
        print "Log-odds matrix for Motif %3d %s" % (i, C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'):
            m.gamma = None  #Kludge to deal w/ old shelves
        if m.seedtxt: print "Seed: %3d %s" % (i, m.seedtxt)
        if m.source: print "Source: ", m.source
        if m.gamma: print "Gamma: %7.5f" % m.gamma
        if m.threshold: print "Threshold: %5.2f" % m.threshold
        if m.thetas != []:
            tstr = "thetas:"
            for theta in m.thetas:
                tstr = tstr + " " + str(theta)
            print tstr
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None:
            vstring = 'ch: %5.2f  %s' % (math.fabs(
                math.log(C.pssm.church) / math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s" % (
            i, C, -math.log(C.pssm.pvalue) / math.log(10), vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%" % (
                C.pssm.threshold, 100.0 * C.pssm.threshold / C.pssm.maxscore)

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq, i, C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()  #Recomputed
        print '*' * len(seq)
        print "MAP Score: %f" % C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0)  #Avoid ridiculous python cleanup times