コード例 #1
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    for fastaf in fastafile:
        ### seqence only needed for length here. MOODS does this parsing again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        thisseq = Bio.SeqIO.read(fastaf, "fasta", alphabet=IUPAC.unambiguous_dna)
        #thisseq = Bio.SeqIO.parse(thisseqname, "fasta", alphabet=IUPAC.unambiguous_dna)
        print 'Doing sequence ', thisseqname, 'length=', len(thisseq)
        
        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            thispwm = Motif.read(open(pwmf), "jaspar-pfm")
            print ' Doing scanPWM one strands for pwm ', thispwmname, ', length=', len(thispwm[0]), datetime.now()
            onestrandsindexvector = thispwm.scanPWM(thisseq.seq)
            x = onestrandsindexvector[0:len(thispwm)-1].copy() # adding missing bp-values on the end to get the same length as seq.
            x[:]=np.NAN
            onestrandsindexvector=np.append(onestrandsindexvector, x)
            onestrandsindexvector = np.array([onestrandsindexvector]) # takes long time.
            print '  bp with nan score  is ', np.isnan(onestrandsindexvector).sum(), ' expected ', (len(thispwm)-1)
            print '  finding best score per bp, ', datetime.now()
            bestscorevector = getMaxPWMScore( onestrandsindexvector, len(thispwm)) 
            
            print '  writing wiggle for score per start index.', datetime.now()
            vegardswritewiggle(onestrandsindexvector[0,], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname)
            
            print '  writing wiggle for bestscore. ', datetime.now()
            vegardswritewiggle(bestscorevector, name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
コード例 #2
0
def readPwmFile(pwmFileName, outputLocation, pseudocounts=0.0):
    """Reads a PWM file in Jaspar format and returns a Biopython PWM object.

    The counts are copied, with `pseudocounts` added to every cell, into a
    temporary file named outputLocation + <basename of pwmFileName> + "temp".
    The motif is parsed from that file and the file is removed afterwards,
    also when parsing fails.

    Keyword arguments:
    pwmFileName -- The path + name of the PWM file.
    outputLocation -- Path to write temporary pseudocount-pwm PWM.
    pseudocounts -- Amount of pseudocounts to add in each matrix cell. (default 0.0)

    Returns:
    pwm -- Biopython PWM object.
    """

    # Adding pseudocounts.
    # split() without an argument tolerates runs of spaces and tabs; the
    # original split(" ") produced empty cells and float('') raised.
    with open(pwmFileName, "r") as pwmFile:
        rows = [line.split() for line in pwmFile]

    tempFileName = outputLocation+pwmFileName.split("/")[-1]+"temp"
    pwmFileT = open(tempFileName, "w")
    try:
        try:
            for cells in rows:
                if not cells:
                    continue  # blank line: nothing to convert
                pwmFileT.write(" ".join([str(float(e)+pseudocounts) for e in cells])+"\n")
        finally:
            pwmFileT.close()

        # Creating PWM from pseudocounted input
        with open(tempFileName, "r") as pwmFile:
            pwm = Motif.read(pwmFile, "jaspar-pfm")
    finally:
        # os.remove avoids building a shell command from the file name.
        os.remove(tempFileName)
    return pwm
コード例 #3
0
ファイル: openmotif_main.py プロジェクト: MaxRego/CodeSamples
def findPFM(jobID, motifObjList, wordObjDict, numMotifs, outputDir):
    """Find the PFM (Position Frequency Matrix) for the top motifs.

    For every motif its words are written to a scratch file in JASPAR sites
    format (see https://github.com/biopython/biopython/blob/master/Doc/cookbook/motif/Arnt.sites),
    each word repeated ``wordObjDict[word].O`` times. The scratch file is
    parsed with Biopython and the per-letter counts are appended to
    ``<jobID>_PFM``, which is finally moved into ``outputDir``.

    Arguments:
    jobID -- prefix for the scratch and result file names.
    motifObjList -- motif objects exposing ``seedWord`` and ``wordList``.
    wordObjDict -- maps a word to an object whose ``O`` attribute is its count.
    numMotifs -- not used by this function.
    outputDir -- destination passed to ``shutil.move``.
    """
    alphaList = ['A', 'C', 'G', 'T']
    siteFileName = ''.join([jobID, '_jasparWordFile'])
    pfmFileName = ''.join([jobID, '_PFM'])
    counter = 1  # running site number, continues across motifs

    with open(pfmFileName, 'wb') as pfmFile:
        for motifObj in motifObjList:
            seedWord = motifObj.seedWord
            # write the words (the scratch file is rewritten for each motif)
            with open(siteFileName, 'wb') as siteFile:
                for word in motifObj.wordList:
                    wordCount = wordObjDict[word].O
                    for i in range(int(wordCount)):
                        siteFile.write('>site ' + str(counter) + '\n' + word + '\n')
                        counter += 1

            with open(siteFileName) as siteHandle:
                srf = Motif.read(siteHandle, 'jaspar-sites')
            srf.make_counts_from_instances()
            pfmFile.write('\n>' + seedWord + '\n')
            for alpha in alphaList:
                pfmFile.write(alpha + ' ' + str(srf.counts[alpha]) + '\n')

    # Fix: the result file is closed (and flushed) before it is moved; the
    # original moved it while still open.
    shutil.move(pfmFileName, outputDir)
    # The scratch file only exists if at least one motif was processed.
    if os.path.exists(siteFileName):
        os.remove(siteFileName)
    return
コード例 #4
0
ファイル: hist.py プロジェクト: cauyrd/MSI
def draw_plot(motiffile):
	"""Plot motif-hit histograms for two FASTA files and save them as a PNG.

	The motif is read from `motiffile` (JASPAR PFM). Sequences come from the
	FASTA files named by sys.argv[2] (drawn as a green histogram) and
	sys.argv[3] (drawn as a red line). The figure is saved as
	<motif file base name up to the first '.'>.png in the current directory.
	"""
	with open(motiffile) as handle:
		mf = Motif.read(handle,'jaspar-pfm')

	def collect_hits(fasta_path):
		# Values returned by search_motif for each record; records for which
		# it returns None are left out.
		hits = []
		for record in SeqIO.parse(fasta_path,'fasta'):
			hit = search_motif(mf,record.seq)
			if hit is not None:
				hits.append(hit)
		return hits

	count = collect_hits(sys.argv[2])
	control = collect_hits(sys.argv[3])

	# assume the sequence length is 201, center base +/- 100bp
	edges = np.linspace(-100,100,101)
	pylab.figure()
	pylab.hist(count, edges, color='g')
	num, _ = np.histogram(control, edges)
	# Fix: draw the control counts at the bin centres. The original used
	# linspace(-100,100,100), whose spacing differs from the bin width, so
	# the red curve drifted relative to the green bars.
	centers = (edges[:-1] + edges[1:]) / 2.0
	pylab.plot(centers, num, color='r')
	pylab.xlabel('Distance relative to Stat5 motif')
	pylab.ylabel('No. Stat5 peaks')
	motifname = os.path.basename(motiffile)
	pylab.title(motifname.split('.')[0])
	pylab.savefig(motifname.split('.')[0]+'.png')
コード例 #5
0
def yield_motifs():
    """Yield (name, motif) pairs, each followed by its reverse complement.

    The motifs are read from the Jaspar_PWMs.txt file: every run of lines
    starting with '>' names the matrix that follows (last whitespace-separated
    token of the first header line, lower-cased). After the file has been
    consumed, a hard-coded 'coup2' matrix is yielded the same way.
    """
    is_header = methodcaller('startswith', '>')
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
        for header_group, chunk in groupby(handle, is_header):
            if header_group:
                # only the first header line of a run is used for the name
                name = next(chunk).strip().split()[-1].lower()
                continue
            matrix_text = ''.join(chunk)
            motif = Motif.read(StringIO(matrix_text), 'jaspar-pfm')
            yield name, motif
            yield name+'-R', motif.reverse_complement()
    # NOTE(review): the A row below lists 13 counts, the other rows 16.
    coup2_text = u"""A 0  0 6 1 0 0 0 4 2 2 0 0 3 
C  1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0  
G  0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 
T  6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""
    coup2 = Motif.read(StringIO(coup2_text), 'jaspar-pfm')
    yield 'coup2', coup2
    yield 'coup2-R', coup2.reverse_complement()
コード例 #6
0
def yield_motifs():
    """Yield (name, motif) and (name + '-R', reverse complement) for each matrix.

    Runs of lines starting with '>' in Jaspar_PWMs.txt supply the name (last
    whitespace-separated token of the first header line, lower-cased); the
    lines up to the next header are parsed as one JASPAR PFM.
    """
    is_header = methodcaller('startswith', '>')
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
        for header_group, chunk in groupby(handle, is_header):
            if header_group:
                name = next(chunk).strip().split()[-1].lower()
                continue
            matrix_text = ''.join(chunk)
            motif = Motif.read(StringIO(matrix_text), 'jaspar-pfm')
            yield name, motif
            yield name+'-R', motif.reverse_complement()
コード例 #7
0
 def _compute(self):
     """Return, per index, the best score among the last `windowLen` entries of both strands.

     `windowLen` is the length of the motif read from self._pfmFileName.
     For index p the result is the NaN-ignoring maximum over
     pwmScores[p-n] and complementPwmScores[p-n] for n in 0..windowLen-1.
     """
     # Close the PFM file once the motif length is known.
     with open(self._pfmFileName) as pfmFile:
         windowLen = len(Motif.read(pfmFile, "jaspar-pfm"))
     pwmScores = self._pwmScoreArrayStat.getResult()
     complementPwmScores = self._complementPwmScoreArrayStat.getResult()
     numScores = len(pwmScores)

     # One row per (shift, strand); cells that are never written stay NaN.
     ret = np.empty((windowLen*2, numScores), dtype='float32')
     ret.fill(np.nan)
     # Shifts of numScores or more leave nothing to copy; the original
     # raised a shape-mismatch error there when the window exceeded the array.
     for n in range(0, min(windowLen, numScores)):
         ret[2*n,n:] = pwmScores[0:numScores-n]
         ret[2*n + 1,n:] = complementPwmScores[0:len(complementPwmScores)-n]

     return np.nanmax(ret, axis=0)
コード例 #8
0
def yield_motifs():
    """Yield (name, motif) and (name + '-R', reverse complement) for each matrix.

    Reads matrix_only.txt from the TFdata directory. A run of '>' lines gives
    the name (last whitespace-separated token of its first line, lower-cased);
    the following non-header lines are parsed as one JASPAR PFM.
    """
    motifdir = '/home/will/Tip60Data/TFdata/'
    is_header = methodcaller('startswith', '>')
    with open(motifdir + 'matrix_only.txt') as handle:
        for header_group, chunk in groupby(handle, is_header):
            if header_group:
                name = next(chunk).strip().split()[-1].lower()
                continue
            matrix_text = ''.join(chunk)
            motif = Motif.read(StringIO(matrix_text), 'jaspar-pfm')
            yield name, motif
            yield name+'-R', motif.reverse_complement()
コード例 #9
0
    def _compute(self):
        sequence = self._sequenceStat.getResult().valsAsNumpyArray()
        bioSeq = Seq(sequence.tostring(), alphabet=IUPAC.unambiguous_dna)

        thisPwm = Motif.read(open(self._pfmFileName), "jaspar-pfm")
        if self._complement:
            thisPwm = thisPwm.reverse_complement()
            
        try:
            pwmScoreArray = thisPwm.scanPWM(bioSeq)
        except MemoryError, e: #when sequence is shorter than pwm
            return
コード例 #10
0
    def _compute(self):
        """Return, per index, the best score among the last `windowLen` entries of both strands.

        `windowLen` is the length of the motif read from self._pfmFileName.
        For index p the result is the NaN-ignoring maximum over
        pwmScores[p-n] and complementPwmScores[p-n] for n in 0..windowLen-1.
        """
        from Bio import Motif

        # Close the PFM file once the motif length is known.
        with open(self._pfmFileName) as pfmFile:
            windowLen = len(Motif.read(pfmFile, "jaspar-pfm"))
        pwmScores = self._pwmScoreArrayStat.getResult()
        complementPwmScores = self._complementPwmScoreArrayStat.getResult()
        numScores = len(pwmScores)

        # One row per (shift, strand); cells that are never written stay NaN.
        ret = np.empty((windowLen * 2, numScores), dtype='float32')
        ret.fill(np.nan)
        # Shifts of numScores or more leave nothing to copy; the original
        # raised a shape-mismatch error there when the window exceeded the array.
        for n in range(0, min(windowLen, numScores)):
            ret[2 * n, n:] = pwmScores[0:numScores - n]
            ret[2 * n + 1,
                n:] = complementPwmScores[0:len(complementPwmScores) - n]

        return np.nanmax(ret, axis=0)
コード例 #11
0
ファイル: NgramMotif.py プロジェクト: cauyrd/NgramMotif
def search_motif(motiflist, seq, col, extend):
	"""Search one motif PWM in paired lists of sequences and controls.

	motiflist -- path of a JASPAR PFM file.
	seq, col  -- iterables walked in lockstep: sequences and their controls.
	extend    -- offset used for the shift distances; when 0 no shift
	             distances are recorded (the original raised NameError here).

	Returns (freq_list, background, len(mf), shift_list) where freq_list holds
	the number of hits (threshold 7.0) per sequence, background is the total
	number of hits in the controls and shift_list collects
	abs(extend - pos) for pos >= 0 and abs(-extend - pos) otherwise.
	"""
	freq_list = []
	shift_list = []
	with open(motiflist) as handle:
		mf = Motif.read(handle,'jaspar-pfm')
	background = 0
	for sequence,control in itertools.izip(seq, col):
		positions = np.array([pos for pos,score in mf.search_pwm(sequence,threshold=7.0)])
		freq_list.append(len(positions))
		if extend != 0:
			shift_list += [abs(extend-base) if base >=0 else abs(-1*extend-base) for base in positions]
		background += len([score for pos,score in mf.search_pwm(control,threshold=7.0)])
	return freq_list, background, len(mf), shift_list
コード例 #12
0
    def _compute(self):
        from Bio.Alphabet import IUPAC
        from Bio.Seq import Seq
        from Bio import Motif

        sequence = self._sequenceStat.getResult().valsAsNumpyArray()
        bioSeq = Seq(sequence.tostring(), alphabet=IUPAC.unambiguous_dna)

        thisPwm = Motif.read(open(self._pfmFileName), "jaspar-pfm")
        if self._complement:
            thisPwm = thisPwm.reverse_complement()

        try:
            pwmScoreArray = thisPwm.scanPWM(bioSeq)
        except MemoryError, e:  #when sequence is shorter than pwm
            return
コード例 #13
0
ファイル: matchpwm.py プロジェクト: cauyrd/Emory
def search_motif(motiflist, seq1, seq2):
	"""Count sequences with and without a strong motif hit in two sequence sets.

	motiflist -- path of a JASPAR PFM file.
	seq1, seq2 -- iterables of sequences.

	Returns a 2x2 numpy array: row 0 counts sequences whose best PWM score
	exceeds 80% of the motif's maximum score, row 1 counts the rest;
	column 0 is seq1 and column 1 is seq2. A sequence without any hit is
	treated as having score -1e5.
	"""
	with open(motiflist) as handle:
		mf = Motif.read(handle,'jaspar-pfm')
	cutoff = 0.8*mf.max_score()
	count_all = np.array([[0,0],[0,0]])
	# Fix: both sets go through the same code. The original assigned
	# max_score3 in the seq2 loop but tested the stale max_score left over
	# from the seq1 loop.
	for column, sequences in enumerate((seq1, seq2)):
		for sequence in sequences:
			scores = [score for pos,score in mf.search_pwm(sequence)]
			max_score = max(scores) if scores else -1e5
			if max_score > cutoff:
				count_all[0,column] += 1
			else:
				count_all[1,column] += 1
	return count_all
コード例 #14
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    for fastaf in fastafile:
        ### seqence only needed for length here. MOODS does this parsing again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        thisseq = Bio.SeqIO.read(fastaf,
                                 "fasta",
                                 alphabet=IUPAC.unambiguous_dna)
        #thisseq = Bio.SeqIO.parse(thisseqname, "fasta", alphabet=IUPAC.unambiguous_dna)
        print 'Doing sequence ', thisseqname, 'length=', len(thisseq)

        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            thispwm = Motif.read(open(pwmf), "jaspar-pfm")
            print ' Doing scanPWM one strands for pwm ', thispwmname, ', length=', len(
                thispwm[0]), datetime.now()
            onestrandsindexvector = thispwm.scanPWM(thisseq.seq)
            x = onestrandsindexvector[0:len(thispwm) - 1].copy(
            )  # adding missing bp-values on the end to get the same length as seq.
            x[:] = np.NAN
            onestrandsindexvector = np.append(onestrandsindexvector, x)
            onestrandsindexvector = np.array([onestrandsindexvector
                                              ])  # takes long time.
            print '  bp with nan score  is ', np.isnan(
                onestrandsindexvector).sum(), ' expected ', (len(thispwm) - 1)
            print '  finding best score per bp, ', datetime.now()
            bestscorevector = getMaxPWMScore(onestrandsindexvector,
                                             len(thispwm))

            print '  writing wiggle for score per start index.', datetime.now()
            vegardswritewiggle(onestrandsindexvector[0, ],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'start_index_score/' +
                               thispwmname)

            print '  writing wiggle for bestscore. ', datetime.now()
            vegardswritewiggle(bestscorevector,
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' +
                               'best_score_in_window/' + thispwmname)
コード例 #15
0
# 1. <inputFileName>.png: The logo graphic.
#####################################################################################################################

import sys
import os
import math
from Bio import Motif

# Reading input
nucsPerImage = int(sys.argv[1])
inputFileLocation = sys.argv[2]
outputLocation = sys.argv[3]

# Reading pfm file
inputFile = open(inputFileLocation, "r")
pwm = Motif.read(inputFile, "jaspar-pfm")

# Writing whole or splited logo
if (nucsPerImage <= 0):
    pwm.weblogo(outputLocation +
                (inputFileLocation.split("/")[-1].split(".")[0]) + ".png",
                res=300)
else:
    tempPWM = [[], [], [], []]
    nucs = ["A", "C", "G", "T"]
    counter = 0
    fileCount = 0
    for i in range(0, len(pwm)):
        for j in range(0, len(nucs)):
            tempPWM[j].append(pwm.counts[nucs[j]][i])
        counter += 1
コード例 #16
0
def make_seq(seq, comp = False):
    """Wrap `seq` in a Seq with the unambiguous DNA alphabet.

    When `comp` is true the reverse complement of that Seq is returned instead.
    """
    dna = Seq(seq,Alphabet.IUPAC.unambiguous_dna)
    return dna.reverse_complement() if comp else dna

def score_seq(mot, seq, comp = False):
    """Return the first entry of mot.scanPWM() applied to make_seq(seq, comp)."""
    scanned = mot.scanPWM(make_seq(seq, comp = comp))
    return scanned[0]


tmp = u"""A 1 2 0 0 0 2 0 0 1 2 
C 1 1 0 0 5 0 1 0 1 0
G 4 4 8 8 2 4 5 6 6 0
T 2 1 0 0 1 2 2 2 0 6"""

sp1_mot = Motif.read(StringIO(tmp), 'jaspar-pfm')

# <codecell>

test_seqs = [('sp3', 'GAGGCGTGGC'),
             ('sp2', 'TGGGCGGGAC'),
             ('sp1', 'GGGGAGTGGC')]
res = []
for name, base_seq in test_seqs:
    bs = list(base_seq)
    
    mat = np.zeros((6, len(bs)+2))
    for n in range(len(bs)):
        olet = bs[n]
        for ln, let in enumerate('ACTG'):
            bs[n] = let
コード例 #17
0
##########testing .......
#calculate_both_strands=True
#outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput'


#fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa']
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm']

#fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'

# Input sequence file and the PFM file used for the test run.
fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'
pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'
# NOTE(review): in the visible code outputdir is only assigned in a
# commented-out line; it has to be defined elsewhere for this to run.
destdir = outputdir



# Read the JASPAR matrix file as a single "jaspar-pfm" motif.
jaspar_file='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt'
# NOTE(review): the file handle passed to Motif.read is never closed explicitly.
thispwm = Motif.read(open(jaspar_file), "jaspar-pfm")

#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm']

#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test of Biopython's Motif package with scanPWM



print "ferdig i pwmtest4"
コード例 #18
0
                mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
                yield name, mot
                yield name+'-R', mot.reverse_complement()
# Collect every motif produced by yield_motifs() under its name,
# printing the running index every 100 motifs as a progress indicator.
pwm_dict = {}
for num, (name, mot) in enumerate(yield_motifs()):
    if num % 100 == 0:
        print num
    pwm_dict[name] = mot
    
    
# Hand-entered 'coup2' count matrix in JASPAR PFM layout.
# NOTE(review): the A row lists 13 counts, the other rows 16.
tmp = u"""A 0  0 6 1 0 0 0 4 2 2 0 0 3 
C  1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0  
G  0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 
T  6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""

# The matrix is parsed twice so that the forward and the reverse-complement
# entries are built from separate motif objects.
pwm_dict['coup2'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
pwm_dict['coup2-R'] = Motif.read(StringIO(tmp), 'jaspar-pfm').reverse_complement()

# <codecell>

from Bio.Alphabet import IUPAC

def score_seq(seq, mot):
    """Yield (pos, subsequence, score) for every non-NaN PWM score along `seq`.

    seq -- DNA sequence as a string.
    mot -- motif object providing scanPWM() and len().

    `pos` is the 1-based index of the score in the flattened scanPWM result;
    `subsequence` is the len(mot)-long slice of `seq` starting at that score's
    0-based index.
    """
    bseq = Seq(seq, alphabet=IUPAC.unambiguous_dna)
    scores = mot.scanPWM(bseq)
    width = len(mot)
    for start, score in enumerate(scores.flatten()):
        if not np.isnan(score):
            # Fix: slice with the 0-based start. The original sliced with the
            # 1-based position, so the reported subsequence was shifted by one.
            yield start + 1, seq[start:start+width], score
    
コード例 #19
0
	def procurar(self,sequencia_string):
		'''
		Search sequencia_string with every distinct motif found in self.diretorio.

		Each "*.pfm" file named <digits>_<digits>.pfm is looked up by that id in
		matrix_list.txt (tab-separated; the motif name is the part after the
		first '_' of the third column). Only the first file seen for each motif
		name is loaded.

		Returns a tuple:
			motivos_finais -- list of (pos, score, motif name, region), one per hit
			tamanho_seq    -- length of sequencia_string
			lista_nomes    -- names of the loaded motifs
			saida_hmm      -- list of (region, motif name), one per hit
			classificar    -- list of sorted [region, motif name] lists, one per hit
		region is "3" when pos >= three_end, otherwise "1" when pos <= tss,
		otherwise "2"; tss is self.pontas percent of the sequence length and
		three_end is the sequence length minus tss.
		'''
		files = glob.glob( self.diretorio + "*.pfm")
		with open( self.diretorio +"matrix_list.txt", 'r') as arquivo_nomes:
			nomes_vetor = arquivo_nomes.read().split('\n')

		lista_motivos = []
		lista_nomes = []
		motivos_checar_repetidos = {}
		for motivo in files:
			isolar_nome = re.search(r'(\d*_\d*)\.pfm$', motivo)
			if isolar_nome is None:
				# File name does not follow the id pattern, so it cannot be
				# matched against matrix_list.txt (the original crashed here).
				continue
			isolado = isolar_nome.group(1)

			for n_pesquisa in nomes_vetor:
				campos = n_pesquisa.split('\t')
				if campos[0] != isolado:
					continue
				nome_recolocar = campos[2].split('_')[1]
				if nome_recolocar in motivos_checar_repetidos:
					# already loaded under this name: only count the repeat
					motivos_checar_repetidos[nome_recolocar] += 1
				else:
					motivos_checar_repetidos[nome_recolocar] = 1
					with open(motivo) as arquivo_motivo:
						lista_motivos.append(Motif.read(arquivo_motivo, "jaspar-pfm"))
					lista_nomes.append(nome_recolocar)
		sequencia = Seq(sequencia_string.upper())

		for converter in lista_motivos:
			converter.make_instances_from_counts()

		tamanho_seq=len(sequencia_string)
		tss=((tamanho_seq*self.pontas)/100)
		three_end= tamanho_seq-tss

		motivos_finais=[]
		saida_hmm=[]
		classificar=[]
		for procura, nome in zip(lista_motivos, lista_nomes):
			for pos, score in procura.search_pwm(sequencia,threshold=10):
				# Same outcome as the original chain of three independent ifs,
				# where the ">= three_end" test overrode the "<= tss" test.
				if pos>=three_end:
					posicao="3"
				elif pos<=tss:
					posicao="1"
				else:
					posicao="2"
				motivos_finais.append((pos, score ,nome,posicao))
				saida_hmm.append((posicao,nome))
				classificar.append(sorted((posicao,nome)))

		return (motivos_finais,tamanho_seq,lista_nomes,saida_hmm,classificar)
コード例 #20
0
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test of Biopython's Motif package with scanPWM

#matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9

from Bio import Motif
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from datetime import datetime

# Read the test motif from a JASPAR PFM file.
# NOTE(review): the original comment calls this the E2F1 motif from the
# JASPAR database; the file name does not confirm that.
testpwm = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm'
# NOTE(review): the handle passed to Motif.read is never closed explicitly.
motif = Motif.read(open(testpwm), "jaspar-pfm")

# the format method displays the motif in a variety of formats:
print motif.format('transfac')

# The first assignment is immediately overwritten; only the chr1.fa path is used.
fastafile = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastafile = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'
handle = open(fastafile, "r")
# NOTE(review): only "from Bio import Motif" is visible above; the name Bio
# itself must be imported elsewhere for Bio.SeqIO to resolve.
records = list(Bio.SeqIO.parse(handle, "fasta",
                               alphabet=IUPAC.unambiguous_dna))
handle.close()
# Only the first record of the FASTA file is scanned.
thisseq = records[0].seq

# Print a timestamp before and after the scan to time scanPWM.
print datetime.now()
hits = motif.scanPWM(thisseq)
print datetime.now()
コード例 #21
0
# Run the scan for the first FASTA file and the first PWM file only,
# then print a timestamp.
makePWMscorefiles(fastafile[0:1], pwmfiles[0:1], outputdir)
print datetime.now()

##########testing .......
#calculate_both_strands=True
#outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput'

#fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa']
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm']

#fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'

# Input sequence file and the matrix file used for the test run.
fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/spombe2007/Sequence/DNA/chr1.fa'
#fastaf = ''
#pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'
pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/matrix/JASPAR_CORE_2008/MA0086.pwf'
destdir = outputdir

# Read the JASPAR matrix file as a single "jaspar-pfm" motif.
jaspar_file = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt'
# NOTE(review): the file handle passed to Motif.read is never closed explicitly.
thispwm = Motif.read(open(jaspar_file), "jaspar-pfm")

#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm']

#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test of Biopython's Motif package with scanPWM

print "ferdig i pwmtest4"
コード例 #22
0
ファイル: test_Motif.py プロジェクト: andyoberlin/biopython
 def test_sites_parsing(self):
     """Motif.read must parse the JASPAR sites input into a motif of length 6."""
     parsed = Motif.read(self.SITESin, "jaspar-sites")
     assert parsed.length == 6
コード例 #23
0
def reverse(sequence):
    """Return `sequence` read backwards with every symbol mapped through revDict."""
    return "".join([revDict[symbol] for symbol in sequence[::-1]])


# Fetching sequences
bedName = bedFileName.split("/")[-1][:-4]
os.system("fastaFromBed -fi " + fastaFileName + " -fo " + outputLocation +
          bedName + ".txt" + " -bed " + bedFileName + " -tab")

# Reading pwm
pwmFile = open(pwmFileName, "r")
pwm = Motif.read(pwmFile, "jaspar-pfm")
motif = str(pwm.consensus()).upper()
pwmFile.close()

# Reading input vectors
misVec = []
misVecSpec = []
posVec = []
posVecSpec = []
scoreVec = []
bedFile = open(bedFileName, "r")
seqFile = open(outputLocation + bedName + ".txt", "r")
for bedLine in bedFile:

    # Reading line
    seqLine = seqFile.readline()
コード例 #24
0
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test of Biopython's Motif package with scanPWM

#matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9

from Bio import Motif
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from datetime import datetime

# Read the test motif from a JASPAR PFM file.
# NOTE(review): the original comment calls this the E2F1 motif from the
# JASPAR database; the file name does not confirm that.
testpwm= '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm'
# NOTE(review): the handle passed to Motif.read is never closed explicitly.
motif=Motif.read(open(testpwm), "jaspar-pfm")

# the format method displays the motif in a variety of formats:
print motif.format('transfac')

# The first assignment is immediately overwritten; only the chr1.fa path is used.
fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'
handle = open(fastafile, "r")
# NOTE(review): only "from Bio import Motif" is visible above; the name Bio
# itself must be imported elsewhere for Bio.SeqIO to resolve.
records = list(Bio.SeqIO.parse(handle, "fasta", alphabet=IUPAC.unambiguous_dna))
handle.close()
# Only the first record of the FASTA file is scanned.
thisseq = records[0].seq

# Print a timestamp before and after the scan to time scanPWM.
print  datetime.now()
hits = motif.scanPWM(thisseq)
print  datetime.now()
コード例 #25
0
 def test_pfm_parsing(self):
     """Motif.read must parse the JASPAR PFM input into a motif of length 12."""
     parsed = Motif.read(self.PFMin, "jaspar-pfm")
     assert parsed.length == 12
コード例 #26
0
ファイル: get_logo.py プロジェクト: cauyrd/Emory
import sys
import random
import os
import numpy as np
from Bio import SeqIO
from Bio import Motif
from scipy.stats import fisher_exact
# Directory holding the source matrices, one file per motif name.
loc = '/compbio/data/motif/human-mouse/'
# logolist.txt lists one matrix file name per line.
# NOTE(review): ifp is never closed in the visible code.
ifp = open('logolist.txt')
for line in ifp:
	name = line.rstrip()
	# Load the matrix, transpose it and save it as integers under the same
	# name in the current directory.
	mat = np.loadtxt(loc+name)
	pfm = np.transpose(mat)
	np.savetxt(name,pfm,fmt='%d')
	# Re-read the transposed copy as a JASPAR PFM and draw its logo to <name>.png.
	# NOTE(review): the handle passed to Motif.read is never closed explicitly.
	mymotif = Motif.read(open(name),'jaspar-pfm')
	mymotif.weblogo(name+'.png')
コード例 #27
0
ファイル: seqprinter.py プロジェクト: daler/seqprint
    def __init__(self, regions, genome_fasta, jaspar_file=None,
                 jaspar_thresh=9999, annotations=None, motif_positions=None,
                 method='motility'):
        """
        Adds motif tracks to BasePrinter, using motility and a file containing
        a JASPAR-format definition of a motif.

        :param regions:
            An iterable of pybedtools.Interval objects

        :param genome_fasta:
            FASTA file from which sequences for `regions` will be extracted

        :param jaspar_file:
            A file in JASPAR format holding exactly one motif.  Motifs in each
            sequence will be identified.
            NOTE(review): the default is None, but the file is opened
            unconditionally, so a path is effectively required.

        :param jaspar_thresh:
            Score threshold below which motifs will be ignored.  Replaced by
            the patser threshold of the motif's score distribution when
            `method` is "biopython".

        :param annotations:
            Optional mapping of name -> BedTool-compatible object; each value
            is wrapped in pybedtools.BedTool and saved with saveas().

        :param motif_positions:
            If this is a list of integer indexes, these positions will be
            converted to uppercase.

        :param method:
            "motility" or "biopython"
        """
        super(MotifPrinter, self).__init__(regions=regions,
                                           genome_fasta=genome_fasta)
        import motility
        assert method in ['biopython', 'motility']
        self.method = method
        pwm = list(helpers.pwm_from_jaspar(jaspar_file))
        assert len(pwm) == 1
        self.pwm = motility.PWM(pwm[0][1])

        # Write a copy of the matrix to the file 'tmp' with the header line,
        # the brackets and the row letters removed, then read that copy with
        # Biopython's "jaspar-pfm" reader.  All handles are closed once used.
        with open('tmp', 'w') as tmp:
            with open(jaspar_file) as source:
                for line in source:
                    if line.startswith('>'):
                        continue
                    for i in '[]ATCG':
                        line = line.replace(i, '')
                    tmp.write(line)
        with open(tmp.name) as stripped:
            self.motif = Motif.read(stripped, 'jaspar-pfm')

        if method == 'biopython':
            sd = Motif.ScoreDistribution(self.motif)
            jaspar_thresh = sd.threshold_patser()

        self.jaspar_thresh = jaspar_thresh

        if motif_positions is None:
            motif_positions = []
        self.motif_positions = motif_positions

        self._annotations = {}
        if annotations:
            for k, v in annotations.items():
                self._annotations[k] = pybedtools.BedTool(v).saveas()

        self.trackfuncs.append(self.motifs)
        self.trackfuncs.append(self.annotations)
        self.intervals = []
コード例 #28
0
            ostr += ' ' + str(count[l])
        ostr += '\n'
    count_dict[name] = ostr

# <codecell>

print(count_dict['COUP2'])

# <codecell>

from io import StringIO

# Parse every count-matrix string in count_dict into a motif object,
# printing the key and the type of its text while doing so.
motif_dict = {}
for key, tmp in count_dict.items():
    print key, type(tmp)
    motif_dict[key] = Motif.read(StringIO(tmp), 'jaspar-pfm')

# Hand-entered count matrix registered as 'AP1' (rows A, C, G, T).
tmp = u"""A  0  0 16  5  3  0 16
C  1  0  2 12  0 15  0
G  0 15  0  1  1  3  1
T  17  3  0  0 14  0  1"""

motif_dict['AP1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

# Hand-entered count matrix registered as 'CEBP' (rows A, C, G, T).
tmp = u"""A  3  1  4  2  4  2 18 18  0
C  0  1  1  9  2 15  0  0  6
G  0  4  6  2 10  0  0  0  2
T  15 12  7  5  2  1  0  0 10"""

motif_dict['CEBP'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
コード例 #29
0
ファイル: test_Motif.py プロジェクト: andyoberlin/biopython
 def test_pfm_parsing(self):
     """Motif.read must parse the JASPAR PFM input into a motif of length 12."""
     parsed = Motif.read(self.PFMin, "jaspar-pfm")
     assert parsed.length == 12
コード例 #30
0
ファイル: count_motif.py プロジェクト: cauyrd/Emory
from Bio import Motif
from scipy.stats import fisher_exact

def search_motif(mf,seq):
	"""Return the position of the best-scoring PWM hit of mf in seq, or None.

	None is returned when search_pwm yields no hit, or when the best score
	does not exceed 80% of mf.max_score(). When several hits share the best
	score, the first one yielded wins.
	"""
	threshold = 0.8*mf.max_score()
	best_score = None
	best_pos = None
	for position, score in mf.search_pwm(seq):
		# strict '>' keeps the earliest hit among equal scores
		if best_score is None or score > best_score:
			best_score, best_pos = score, position
	if best_score is not None and best_score > threshold:
		return best_pos
	return None

count = 0
mf = Motif.read(open(sys.argv[1]),'jaspar-pfm')
ofp = open(sys.argv[2]+'.motif.fa','w')
for i,record in enumerate(SeqIO.parse(sys.argv[2],'fasta')):
	hit = search_motif(mf,record.seq) 
	if hit == None:
		continue
	else:
		record.id = record.id+'_'+str(hit)
		print >> ofp, '>'+record.id
		print >> ofp, record.seq
		count += 1
print str(i+1)+'\ttotal seq(s)'
print str(count)+'\tcontains motifs ('+str(count/float(i+1)*100)+'%)'
コード例 #31
0
 def test_sites_parsing(self):
     """Motif.read must parse the JASPAR sites input into a motif of length 6."""
     parsed = Motif.read(self.SITESin, "jaspar-sites")
     assert parsed.length == 6
コード例 #32
0
ファイル: motif.py プロジェクト: dbgoodman/splicemod
    def __init__(self,
                 type,
                 score_type,
                 upstr=None,
                 invariant=[],
                 score_dict={},
                 bounds=None,
                 filter_score=None,
                 note_str_func=default_note_str,
                 # mutant_check= _mutant_check
                 ** attribs
                 ):
        self.type = type
        self.upstr = upstr
        self.invariant = invariant
        self.filter_score = filter_score
        self.note_str = note_str_func
        self.bounds = bounds
        self.score_type = score_type
        # self.mutant_check = mutant_check
        
        #------------------------------------------------
        # FIRST: figure out what score type this motif is.
        #------------------------------------------------
        
        # aho-corasick search tree
        if self.score_type == 'acora':
            # we need a score dict from a file and a separate acora object
            if not isinstance(score_dict, str):
                raise NeedScoreDictFileException
            self.score_dict = \
                dict([record.split() for record in open(score_dict)])
            
            self.acora_tree = acora.AcoraBuilder(self.score_dict.keys()).build()            
            self.score = self.acora
        
        # ternary search tree
        elif score_type == 'tst':
            # make our file of nmers and scores a tst object
            self.score_dict = tst.TST()
            tstmap = lambda tuple: self.score_dict.put(*tuple)
            map(tstmap, ([record.split() for record in open(score_dict)]))
            
            self.score = self.tst
            
        # max ent nmer score
        elif score_type == 'max_ent':
            self.score_dict = {}
            self.score = self.max_ent
            
            # open up an 'interactive' pipe to the maxent software for this motif
            programs = {'me_splice_donor':'score5', 'me_splice_acceptor':'score3'}

            self.command = cfg.programTemplate.substitute(path=cfg.maxEntPath,
                                        program=programs[self.type])           
            
        # positon frequency matrix list
        elif score_type == 'pfm':
            print "Loading position frequency matrices from {}...".format(self.type)
            pfm_glob = glob.glob(score_dict)
            name_pattern = re.compile('.*/(\w+).pfm')
            
            self.score_dict = {}
            
            for motif_file in pfm_glob:
                motif_obj = Motif.read(open(motif_file), 'jaspar-pfm')
                motif_name = name_pattern.match(motif_file).group(1)
                print "\t{}...".format(motif_name)
                motif_obj.name = motif_name
                if len(motif_obj) > 7:
                    self.score_dict[motif_name] = motif_obj
                    motif_obj.sd = \
                        ScoreDistribution(motif_obj, precision=10 ** 3)
                        
                    # low false-positive rate to make sure motifs are real
                    motif_obj.thresh = max(1, motif_obj.sd.threshold_fpr(0.01))            
            
            self.score = self.pfm
            print "Motif matrices done."

                
            
                
        #------------------------------------------------
        # SECOND: parse filter score information.
        #------------------------------------------------
        
        # if filter score is an int, make it a lambda function that determines
        # whether or not it is a 'worthwile' score; this could be as simple as 
        # > 0, or it could be a range, etc, etc. The lambda function will 
        # return true if the score should be kept and false if it should not.
        
        if isinstance(self.filter_score, float) or \
            isinstance(self.filter_score, int):
            
            self.filter_score = \
                lambda val, min = self.filter_score: \
                    val > min
        
        elif isinstance(self.filter_score, tuple):
            
            self.filter_score = \
                lambda val, minmax = self.filter_score: \
                    val < minmax[0] or val > minmax[1]
        
        # if filter_score is none, always return true
        elif self.filter_score == None:
            self.filter_score = lambda val: True
       
        #------------------------------------------------
        # THIRD: add motif to motif type dict and cleanup
        #------------------------------------------------ 
        motif_types[self.type] = self
        self.attribs = attribs