Beispiel #1
0
def tamofile2motifs(filename):
    """Parse a motif text file into a list of motif objects.

    The file is scanned line by line and the following line prefixes are
    recognised:

    * ``Log-odds m...`` -- starts a new motif.  The next line gives the
      width (number of whitespace-separated tokens minus one) and the four
      lines after that give one row of log-odds values per letter.
    * ``Motif ``      -- statistics for the most recently parsed motif,
      read from fixed token positions.
    * ``Threshold:``, ``Source:``, ``Gamma:``, ``Evalue`` -- single-value
      attributes set on the most recently parsed motif.
    * ``Seed ``       -- a ``number: text`` entry collected into a lookup
      table.
    * any line containing both ``Using`` and ``as seeds`` -- the third
      token from the end is taken as the seed file name.

    After the scan each motif gets ``seedfile`` (if one was found) and
    ``seedtxt`` (if its ``seednum`` has an entry in the seed table).

    Args:
        filename: Path of the text file to read.

    Returns:
        List of objects built by ``MotifTools.Motif_from_ll``, in file order.

    Raises:
        IndexError: if an attribute line appears before any matrix, or a
            line has fewer tokens than the fixed positions require.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(filename, 'r') as FID:
        lines = FID.readlines()

    motifs = []
    seedD = {}
    seedfile = ''
    for i, line in enumerate(lines):
        if line.startswith('Log-odds m'):
            # Width comes from the header row following the marker line.
            w = len(lines[i + 1].split()) - 1
            ll = [{} for _ in range(w)]
            for j in range(4):
                toks = lines[i + j + 2].split()
                # The letter is the second character of the row label.
                letter = toks[0][1]
                for pos in range(w):
                    ll[pos][letter] = float(toks[pos + 1])
            motifs.append(MotifTools.Motif_from_ll(ll))
        elif line.startswith('Motif '):
            toks = line.split()
            current = motifs[-1]
            # Token 3 is wrapped in parentheses; strip them before parsing.
            current.nseqs = float(re.sub(r'[\(\)]', '', toks[3]))
            current.totalbits = float(toks[5])
            current.MAP = float(toks[7])
            current.seeddist = float(toks[9])
            # Drop the trailing punctuation character from the seed number.
            current.seednum = int(toks[10][0:-1])
            # Stored in the file as -log10(p); convert back to a probability.
            current.pvalue = math.pow(10, -float(toks[12]))
            if 'ch:' in toks:
                current.church = math.pow(10, -float(toks[14]))
        elif line.startswith('Threshold:'):
            motifs[-1].threshold = float(line.split()[1])
        elif line.startswith('Seed '):
            toks = line.split()
            seed_id = int(toks[1][0:-1])  # '10:' -> 10
            seedD[seed_id] = toks[2]
        elif line.startswith('Source:'):
            motifs[-1].source = line[7:].strip()
        elif line.startswith('Gamma:'):
            motifs[-1].gamma = float(line[6:])
        elif line.startswith('Evalue'):
            # Only the first six characters are matched; the value starts
            # after the seventh character (the colon).
            motifs[-1].evalue = float(line[7:].strip())

        # Independent of the prefix checks above, e.g.
        # '#Using all (132) motifs in SLT_081503.seeds as seeds:'
        if line.find('Using') >= 0 and line.find('as seeds') >= 0:
            seedfile = line.split()[-3]

    for motif in motifs:
        if seedfile:
            motif.seedfile = seedfile
        seednum = motif.seednum
        # 'in' replaces dict.has_key, which no longer exists in Python 3.
        if seednum in seedD:
            motif.seedtxt = seedD[seednum]
    return motifs
Beispiel #2
0
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Score every sequence against every motif and save the score matrix.

    For each motif loaded from ``motif`` the log-probability matrix is
    adjusted by subtracting log2 of the zeroth-order background frequency
    of each letter, a new motif is built from the adjusted matrix, and its
    best score on each (upper-cased) sequence is rescaled as
    ``(best - minscore) / (maxscore - minscore)``.

    Args:
        fsa: Path passed to ``Fasta.load`` to obtain the sequences.
        motif: Path passed to ``MotifTools.load`` to obtain the motifs.
        outfile: Destination passed to ``np.savetxt``; one row per motif,
            one column per sequence, formatted with ``'%1.3f'``.
        genome: ``'hg18'`` selects the human background file; any other
            value selects the mouse background file.
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    m = MotifTools.load(motif)

    # The background depends only on `markov`, so load it once rather than
    # re-reading the file for every motif.
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()
    log_of_two = math.log(2.0)

    F = Fasta.load(fsa, key_func=lambda x: x)
    seqs = F.values()
    n_seqs = len(seqs)
    n_motifs = len(m)
    SCORES = np.zeros((n_motifs, n_seqs), dtype='float')

    for i, M in enumerate(m):
        # NOTE(review): the adjustment below writes into the object returned
        # by M.logP; whether that is the motif's own storage is not visible
        # here.
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter]) / log_of_two
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        # NOTE(review): ma == mi would divide by zero; presumably cannot
        # happen for a real motif -- confirm.
        for j, seq in enumerate(seqs):
            max_score = AM.bestscore(seq.upper())
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
Beispiel #3
0
def main():
    """Command-line driver for THEME cross-validated hypothesis testing.

    THEME.py: THEME module for performing cross-validated hypothesis
    testing on transcription factor binding data.

    Usage:
        python THEME.py foreground_fasta_file background_fasta_file
            hypothesis_indices -fse hypothesis_file -markov markov_background
            [-motif_file output_file] [-cv folds] [-beta b] [-delta d]
            [-family name] [-norefine] [-randomization] [-dump]

    ``hypothesis_indices`` is either ``all`` or colon-separated integer
    indices into the motifs loaded from the ``-fse`` file.  ``-markov`` and
    ``-fse`` are required: their values are used unconditionally below.

    Exits with status 1 when the positional arguments or either required
    option is missing.
    """
    if (len(sys.argv) < 4):
        print("Usage: THEME.py foreground.fsa background.fsa hypothesis_indices "
              "-fse hypotheses.txt -markov markov_background [options]")
        sys.exit(1)

    fg_file = sys.argv[1]           # fasta file with foreground sequences
    bg_file = sys.argv[2]           # fasta file with background sequences
    test_indices = sys.argv[3]      # 'all' or colon separated indices into fse file
    cv_level = 2                    # default 2-fold cross-validation
    refine = 1
    randomize = 0
    beta = 0.0
    delta = 0.001
    motif_file = 'dummy.out'
    dump_categories_to_file = 0
    test_family = ''
    markov_file = None              # required, set by -markov
    fse_file = None                 # required, set by -fse

    # Read in any command line options; value options take the next token.
    for i, arg in enumerate(sys.argv):
        if (arg == '-cv'):
            cv_level = int(sys.argv[i+1])
        elif (arg == '-markov'):
            markov_file = sys.argv[i+1]
        elif (arg == '-fse'):
            fse_file = sys.argv[i+1]
        elif (arg == '-norefine'):
            refine = 0
        elif (arg == '-beta'):
            beta = float(sys.argv[i+1])
        elif (arg == '-delta'):
            delta = float(sys.argv[i+1])
        elif (arg == '-randomization'):
            randomize = 1
        elif (arg == '-motif_file'):
            motif_file = sys.argv[i+1]
        elif (arg == '-dump'):
            dump_categories_to_file = 1
        elif (arg == '-family'):
            # The original assigned an undefined name here; take the option's
            # value from the command line like every other valued option.
            test_family = sys.argv[i+1]

    # Fail early with a readable message instead of an unbound-variable
    # error further down.
    if markov_file is None or fse_file is None:
        print("THEME.py: both -markov <file> and -fse <file> are required")
        sys.exit(1)

    # Start the motif output file afresh with just the banner line.
    with open(motif_file, 'w') as FH:
        FH.write("******THEME Motif Output******")

    random.seed()

    cross_val = THEME(fg_file, bg_file, cv_level, markov_file)
    # beta is only applied when strictly between 0 and 1, as the ratio
    # beta / (1 - beta).
    if ((beta > 0.0) and (beta < 1.0)):
        cross_val.beta = beta/(1-beta)
    cross_val.delta = delta
    cross_val.refine = refine
    cross_val.randomize = randomize
    cross_val.motif_file = motif_file
    if (test_family):
        cross_val.family = test_family
    if (dump_categories_to_file):
        cross_val.dump = 1

    ###################################################################################
    # get seed sequences that will be tested
    ###################################################################################
    models = []
    fses = MotifTools.load(fse_file)
    if (test_indices == 'all'):
        indices = range(len(fses))
    else:
        indices = []
        ivals = test_indices.split(':')
        for v in ivals:
            indices.append(int(v))
    for i in indices:
        # NOTE(review): the adjustment writes into the object returned by
        # fses[i].logP, so listing the same index twice may apply it twice --
        # confirm whether logP returns a copy.
        ll = fses[i].logP
        # Presumably the background was loaded while constructing THEME
        # above -- verify against that class.
        bg = EM.theMarkovBackground.zeroth()
        for pos in ll:
            for letter in pos.keys():
                # Subtract log2 of the background frequency for this letter.
                pos[letter] = pos[letter] - math.log(bg[letter])/math.log(2.0)
        adj_bg_model = MotifTools.Motif_from_ll(ll)
        adj_bg_model.source = fses[i].source
        models.append(adj_bg_model)

    (m, err) = cross_val.run_CV(models)