Exemple #1
0
def info2seeds(N,infofile,probefile,species='YEAST'):
    G    = ProbeSet(species)
    IDs  = G.ids_from_file(probefile)
    Q    = EM.theMarkovBackground.zeroth()
 
    seqs = Fasta.seqs(infofile)
    
    if not N:
        nmers = seqs
    else:
        nmers= MotifTools.top_nmers(N,seqs)
        if len(nmers) > 1000: nmers = nmers[0:1000]
        
    print "Scoring enrichment of %d nmers from %s"%len(nmers,infofile)
    sys.stdout.flush()
    
    nmers_scoresT = []
    for nmer in nmers:
        if nmer.isalpha():
            p = G.p_value(nmer,IDs,'') #'verbose'
            nmers_scoresT.append((nmer,p))
    nmers_scoresT.sort(lambda x,y: cmp(x[1],y[1]))
    last = min(20,len(nmers_scoresT))
    models = []
    for i in range(last):
        seq = nmers_scoresT[i][0]
        m = MotifTools.Motif('',Q)
        m.compute_from_text(seq,0.1)
        models.append(m)
    for tup in nmers_scoresT[0:40]:
        print tup
    return(models)
Exemple #2
0
def main():
    if len(sys.argv) < 2:
        print "Usage: %s <fasta_file> [width = None ] [options]"%(re.sub('^.*/','',sys.argv[0]))
        print "Options include:"
        print ""
        print " EM Parameters:"
        print "                  -beta    [0.01]   Beta for pseudocounts"
        print "                  -seedbeta[0.02]   Beta for pseudocounts for seeds from text"
        print "                  -gamma   [0.2]    Gamma (fraction of sequences)"
        print "                  -delta   [0.001]  Convergence criteria"
        print " "
        print " Seeds (not actually proper priors)"
        print "                  -prior            Seqences or motifs for seeds (may be repeated)"
        print "                  -top N   [0]      Include w-mers in top N probes"
        print "                  -gap    string    sample gapped motifs"
#       print "                  -TF               Seed with (all) TRANSFAC PSSMs (buggy)"
        print "                  -kmerseeds        Use kmers with best enrichment score as seeds for EM"
        print "                  -pad              add NN..NN to seed"
        print " "
        print " Genome / Background model "
        print "                  -human (250,1000) Use Human Background model"
        print "                  -g genome.fsa     Use specicied Fasta file as background (searches first for matching frequency file)"
#       print "                  -Y2K, -Y5C        Use Yeast Upstream Intergenic regions (2000, 500)"
#       print "                  -B                Use Bacterial Orfs"
        print " " 
        print "Examples:"
        print " %s t.fsa 5 -prior GGGTA -prior AAAAAC "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 3 seeds: GGGTA, AAAAA, and AAAAC"
        print 
        print " %s t.fsa 5 -info CUP9.info -gamma 0.5 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with Enriched seeds in CUP9.info, with"
        print "   Gamma expectation of 50% of all probes"
        print 
        print " %s t.fsa -prior MCM1_5.tamo:0 "%(sys.argv[0].split('/')[-1])
        print "   will start an EM with 0th motif of the file MCM1_5.tamo"
        print "   as a seed"
        print 
        sys.exit(1)
    fastafile = sys.argv[1]

    #Echo the command line
    print "#" + ' '.join(map(lambda x: re.sub(' ','\ ',x), sys.argv))

    if sys.argv[2].isdigit():
        width = sys.argv[2]
    else: width = None
    
    algorithm = ''
    beta      = ''
    seedbeta  = ''
    deltamin  = ''
    gamma     = 0.2
    infofile  = ''
    seedmodels= []
    species   = 'YEAST'
    valid_tfs = [] #NOT USED
    gapped_syl= None
    gapflank  = 0
    gapweight = 0.2
    enrichfact= 0.7
    pmax      = 0  #False
    TFSEEDS   = 0
    TFMids    = []
    pad       = None
    bgfile    = None

    seed_count = 0   #Default: Take the top 0
    seed_s     = []  #Initialize seq array

    '''Parse command-line arguments'''
    for tok,i in zip(sys.argv,xrange(len(sys.argv))):
        if   tok == '-top'   :   seed_count = int(sys.argv[i+1])
        elif tok == '-greedy':   algorithm  = "GREEDY"
        elif tok == '-prior' :   seed_s.append(sys.argv[i+1])
        elif tok == '-beta'  :   beta       = float(sys.argv[i+1])
        elif tok == '-seedbeta': seedbeta   = float(sys.argv[i+1])
        elif tok == '-gamma' :   gamma      = float(sys.argv[i+1])
        elif tok == '-delta' :   deltamin   = float(sys.argv[i+1])
        elif tok == '-kmerseeds' :   infofile   = 1
        elif tok == '-valid' :   valid_tfs.append(sys.argv[i+1]) #NOT USED
        elif tok == '-w'     :   width      = sys.argv[i+1]
        elif tok == '-width' :   width      = sys.argv[i+1]
        elif tok == '-gap'   :   gapped_syl = sys.argv[i+1]
        elif tok == '-gapflank' :gapflank   = int(sys.argv[i+1])
        elif tok == '-gapweight':gapweight  = float(sys.argv[i+1])
        elif tok == '-enrichfact':enrichfact= float(sys.argv[i+1])
        elif tok == '-pmax'  :   pmax       = 1
        elif tok == '-Y2K'   :   species    = "YEAST_2000_UP"
        elif tok == '-Y5C'   :   species    = "YEAST_500_UP"
        elif tok == '-B'     :   species    = "BAC_ORF"
        elif tok == '-Ch22'  :   species    = "Ch22"
        elif tok == '-genome':   species    = sys.argv[i+1]
        elif tok == '-pad'   :   pad        = "TRUE"
        elif tok == '-bgfile':   bgfile     = sys.argv[i+1]
        elif tok == '-TF'    :  #NOT USED (TRANSFAC NOT SUPPLIED WITH DISTRIBUTION)
            TFSEEDS = 1
            for j in range(i+1,len(sys.argv)):
                if re.match('M0',sys.argv[j]):
                    TFMids.append(sys.argv[j])
                else:
                    break
        elif tok == '-human' :
            _s = ''
            if sys.argv[i+1].isdigit(): _s = '_'+sys.argv[i+1]
            else:                       _s = ''
            species    = 'HUMAN'+_s

    if infofile: infofile = fastafile

    if bgfile:
        EM.loadMarkovBackground(bgfile)
    elif not ('-random_background' in sys.argv or '-nomarkov' in sys.argv):
        EM.loadMarkovBackground(species)
    else:
        EM.theMarkovBackground = EM.Zeroth()

    fsaD     = Fasta.load(fastafile)
    Fasta.delN(fsaD)
    seqs     = fsaD.values()
    probes   = fsaD.keys()
    all_seqs = seqs
    seed_s.extend(seqs[0:min(seed_count,len(seqs))])

    if infofile and width=='info':
        width = info2width(infofile)
    elif width != None:
        width = int(width)

    #Alternate source of seeds
    if infofile:
        if 1 or width:
            seedmodels.extend(info2seeds(width,infofile,fastafile,species))
        else:
            print 'Error: need to specify motif width w/ .info file'
    
    #Any -prior pointers to motifs in other files?
    (seed_s, motifs) = parse_priors(seed_s)
    seedmodels.extend(motifs)

    #Should we get seeds from TRANSFAC?
    if TFSEEDS: #NOT USED
        tf = []
        D  = tfmats()
        if not TFMids:
            keys = D.keys()
        else:
            keys = []
            for TFMid in TFMids:
                for key in D.keys():
                    if key[0:6] == TFMid:
                        keys.append(key)
                        break
        for key in keys:
            m = D[key]
            m.seednum = int(re.sub('M0*','',key.split()[0]))
            m.seedtxt = '%-24s %s'%(m,key)
            tf.append(m)
        tf.sort(lambda x,y: cmp(x.seednum,y.seednum))
        seedmodels.extend(tf)
        #seedmodels.append(tf[33])

    if gapped_syl:
        gapped_priors = gapped_motifs(gapped_syl)
        gapped_priors = map(lambda x:'N'+x+'N', gapped_priors)
        seed_s.extend(gapped_priors)

    if pad:
        print '# Padding models with NN-m-NN'
        newmodels = []
        left  = MotifTools.Motif_from_text('@')
        right = MotifTools.Motif_from_text('N')
        for m in seedmodels:
            newmodels.append(left + m + right)
            print left + m + right
        seedmodels = newmodels

    '''
    Set everything up and GO!!
    '''
    global theEM
    theEM = EM.EM(seed_s,[],width,"VERBOSE")
    if beta:     theEM.beta     = beta
    if deltamin: theEM.deltamin = deltamin
    if seedbeta: theEM.seedbeta = seedbeta
    theEM.param['gamma']        = gamma
    theEM.seqs.extend(all_seqs)
    theEM.models    = seedmodels
    theEM.gapflank  = gapflank
    theEM.gapweight = gapweight
    theEM.report()
    theEM.EM_Cstart()    #GO!!

    #print "#Sorting candidates"
    #sys.stdout.flush()
    #EM.candidates.sort(lambda x,y: cmp(y.MAP,x.MAP))


    '''
    Compute some metrics
    '''
    print "#Loading Genome %s"%species ; sys.stdout.flush()
    Genome = ProbeSet(species,enrichfact)
    ids    = Genome.ids_from_file(fastafile)
    
    for C in theEM.candidates:
        if not pmax:
            C.pssm.pvalue = Genome.p_value(C.pssm,ids,'verbose')
            C.pssm.church = Genome.church(C.pssm,ids)
            C.pssm.frac   = Genome.frac(C.pssm,probes,None,0.7)
        else:
            (p,frac) = Genome.best_p_value(C.pssm,ids)
            C.pssm.pvalue    = p
            C.pssm.threshold = frac * C.pssm.maxscore
            print "Bests:",p,frac

        matching             = Genome.matching_ids(C.pssm,[],factor=0.7)
        matchbound           = [x for x in matching if x in probes]
        C.pssm.numbound      = len(probes)
        C.pssm.nummotif      = len(matching)
        C.pssm.numboundmotif = len(matchbound)
        sys.stdout.flush()

    
    '''
    Print out all motifs (sorted by Enrichment) in an AlignACE-like form
    '''

    theEM.candidates.sort(lambda x,y: cmp(x.pssm.pvalue,y.pssm.pvalue))
    for C,i in zip(theEM.candidates,range(len(theEM.candidates))):
        C.pssm.maxscore = -100  #May have side effects.  Recompute when done
        if C.pssm.valid:  #NOT USED
            _t = C.pssm.valid
            if not _t[0]:
                vstring = "(--- %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
            else:
                vstring = "(HIT %8.4f %8.4f %s)"%(_t[1],_t[2],_t[3])
        else:
            vstring = ''
        C.pssm._maxscore()     #Recomputed

        MotifTools.print_motif(C.pssm,20,i)
        sys.stdout.flush()
        continue
    
        #Antiquated stuff  -- Remove !!
        print "Log-odds matrix for Motif %3d %s"%(i,C)
        C.pssm._print_ll()
        print "Sequence Logo"
        C.pssm._print_bits()
        flush()
        #print '# %3d matching sequences at 90%%'%len(C.pssm.bestseqs(C.pssm.maxscore * 0.9))
        flush()
        m = C.pssm
        if not m.__dict__.has_key('gamma'):  m.gamma = None #Kludge to deal w/ old shelves
        if m.seedtxt:     print "Seed: %3d %s"%(i,m.seedtxt)
        if m.source:      print "Source: ",m.source
        if m.gamma:       print "Gamma: %7.5f"%m.gamma
        if m.threshold:   print "Threshold: %5.2f"%m.threshold
        #if C.pssm.seedtxt:
        #    print 'Seed  %3d %-25s'%(i,C.pssm.seedtxt)
        if C.pssm.church != None: vstring = 'ch: %5.2f  %s'%(
            math.fabs(math.log(C.pssm.church)/math.log(10)), vstring)
        print "Motif %3d %-25s  nlog(p): %6.3f  %s"%(i,C,-math.log(C.pssm.pvalue)/math.log(10),vstring)
        if C.pssm.threshold:
            print "Threshold: %6.3f  %4.1f%%"%(
                C.pssm.threshold, 100.0*C.pssm.threshold/C.pssm.maxscore)
            

        C.pssm.maxscore = -1e100  #May have side effects.  Recompute when done
        for seq in C.wmers:
            print seq,i,C.pssm.scan(seq)[2][0]
        C.pssm._maxscore()      #Recomputed
        print '*'*len(seq)
        print "MAP Score: %f"%C.MAP
        sys.stdout.flush()
    sys.stdout.flush()
    sys.exit(0) #Avoid ridiculous python cleanup times