def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    """Scan every FASTA sequence with every PWM and write the scores as wiggle tracks.

    For each (sequence, PWM) pair the one-strand ``scanPWM`` scores are padded
    with NaN up to the sequence length, passed through ``getMaxPWMScore`` and
    both vectors are handed to ``vegardswritewiggle``.

    Keyword arguments:
    fastafiles -- Iterable of paths to FASTA files holding one record each.
    pwmfiles -- Iterable of paths to PFM files in Jaspar format.
    destdir -- Directory prefix for the 'start_index_score/' and
               'best_score_in_window/' output paths.
    both_strands -- Accepted for interface compatibility; not used here.
    """
    # The parameter is iterated here; the previous code silently relied on a
    # module-level variable called ``fastafile``.
    for fastaf in fastafiles:
        # Sequence only needed for length here. MOODS does this parsing again
        # later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        thisseq = Bio.SeqIO.read(fastaf, "fasta", alphabet=IUPAC.unambiguous_dna)
        print('Doing sequence  %s length= %s' % (thisseqname, len(thisseq)))
        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            with open(pwmf) as pwmhandle:
                thispwm = Motif.read(pwmhandle, "jaspar-pfm")
            print(' Doing scanPWM one strands for pwm  %s , length= %s %s'
                  % (thispwmname, len(thispwm[0]), datetime.now()))
            onestrandsindexvector = thispwm.scanPWM(thisseq.seq)
            # Adding missing bp-values on the end to get the same length as
            # seq; copying a slice keeps the dtype of the score vector.
            x = onestrandsindexvector[0:len(thispwm) - 1].copy()
            x[:] = np.nan
            onestrandsindexvector = np.append(onestrandsindexvector, x)
            # A new leading axis gives the 1 x N layout without the slow
            # np.array([vector]) copy.
            onestrandsindexvector = onestrandsindexvector[np.newaxis, :]
            print(' bp with nan score is  %s  expected  %s'
                  % (np.isnan(onestrandsindexvector).sum(), len(thispwm) - 1))
            print(' finding best score per bp,  %s' % (datetime.now(),))
            bestscorevector = getMaxPWMScore(onestrandsindexvector, len(thispwm))
            print(' writing wiggle for score per start index. %s' % (datetime.now(),))
            vegardswritewiggle(onestrandsindexvector[0, ],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'start_index_score/' + thispwmname)
            print(' writing wiggle for bestscore.  %s' % (datetime.now(),))
            vegardswritewiggle(bestscorevector,
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'best_score_in_window/' + thispwmname)
def readPwmFile(pwmFileName, outputLocation, pseudocounts=0.0):
    """Reads a PWM file in Jaspar format and returns a Biopython PWM object.

    A temporary copy of the matrix with the pseudocounts added to every cell
    is written to ``outputLocation``, parsed, and removed again (also when
    parsing fails).

    Keyword arguments:
    pwmFileName -- The path + name of the PWM file.
    outputLocation -- Path to write temporary pseudocount-pwm PWM.
    pseudocounts -- Amount of pseudocounts to add in each matrix cell. (default 0.0)

    Returns:
    pwm -- Biopython PWM object.
    """
    tempFileName = outputLocation + pwmFileName.split("/")[-1] + "temp"
    try:
        # Adding pseudocounts
        with open(pwmFileName, "r") as pwmFile:
            with open(tempFileName, "w") as pwmFileT:
                for line in pwmFile:
                    # split() without an argument tolerates tabs and repeated
                    # blanks, which made float('') fail before.
                    cells = line.split()
                    if not cells:
                        continue
                    pwmFileT.write(" ".join([str(float(e) + pseudocounts) for e in cells]) + "\n")

        # Creating PWM from pseudocounted input
        with open(tempFileName, "r") as pwmFile:
            pwm = Motif.read(pwmFile, "jaspar-pfm")
    finally:
        # os.remove instead of a shell "rm": no quoting problems with unusual
        # paths and no dependency on a POSIX shell.
        if os.path.exists(tempFileName):
            os.remove(tempFileName)

    return pwm
def findPFM(jobID, motifObjList, wordObjDict, numMotifs, outputDir):
    """Find the PFM (Position Frequency Matrix) for the top motifs.

    For every motif object its words are written to a scratch Jaspar sites
    file (each word repeated ``wordObjDict[word].O`` times), the file is read
    back with Biopython, and the resulting counts are appended to
    ``<jobID>_PFM``. The PFM file is finally moved to ``outputDir``.

    Keyword arguments:
    jobID -- Prefix for the scratch sites file and the PFM file.
    motifObjList -- Motif objects providing ``seedWord`` and ``wordList``.
    wordObjDict -- Maps a word to an object whose ``O`` attribute is its count.
    numMotifs -- Not used by this function.
    outputDir -- Destination passed to ``shutil.move`` for the PFM file.
    """
    # Write the words of each motif in Jaspar site format like here:
    # https://github.com/biopython/biopython/blob/master/Doc/cookbook/motif/Arnt.sites
    alphaList = ['A', 'C', 'G', 'T']
    siteFileName = ''.join([jobID, '_jasparWordFile'])
    pfmFileName = ''.join([jobID, '_PFM'])
    counter = 1  # site numbering keeps running across motifs

    with open(pfmFileName, 'wb') as pfmFile:
        for motifObj in motifObjList:
            seedWord = motifObj.seedWord
            # The scratch file is rewritten from scratch for every motif.
            with open(siteFileName, 'wb') as siteFile:
                for word in motifObj.wordList:
                    wordCount = wordObjDict[word].O
                    for i in range(int(wordCount)):
                        siteFile.write('>site ' + str(counter) + '\n' + word + '\n')
                        counter += 1
            with open(siteFileName) as siteHandle:
                srf = Motif.read(siteHandle, 'jaspar-sites')
            srf.make_counts_from_instances()
            pfmFile.write('\n>' + seedWord + '\n')
            for alpha in alphaList:
                pfmFile.write(alpha + ' ' + str(srf.counts[alpha]) + '\n')

    # Move only after the file is closed, so buffered data is on disk before
    # shutil.move possibly copies it to another file system.
    shutil.move(pfmFileName, outputDir)
    # With an empty motif list the scratch file was never created.
    if os.path.exists(siteFileName):
        os.remove(siteFileName)
    return
def draw_plot(motiffile):
    """Generate a histogram of motif hit positions and save it as a PNG.

    The motif is read from ``motiffile`` (Jaspar PFM). Hits returned by
    ``search_motif`` for the FASTA records in ``sys.argv[2]`` are drawn as a
    green histogram; hits for the records in ``sys.argv[3]`` are binned the
    same way and drawn as a red line. The figure is saved as
    ``<motif file name up to the first dot>.png`` in the working directory.
    """
    with open(motiffile) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')

    def collect_hits(fasta_path):
        # Keep the hit of every record for which search_motif found one.
        hits = []
        for record in SeqIO.parse(fasta_path, 'fasta'):
            hit = search_motif(mf, record.seq)
            if hit is not None:
                hits.append(hit)
        return hits

    count = collect_hits(sys.argv[2])
    control = collect_hits(sys.argv[3])

    # assume the sequence length is 201, center base +/- 100bp
    edges = np.linspace(-100, 100, 101)
    pylab.figure()
    pylab.hist(count, edges, color='g')
    num, _control_edges = np.histogram(control, edges)
    pylab.plot(np.linspace(-100, 100, 100), num, color='r')
    pylab.xlabel('Distance relative to Stat5 motif')
    pylab.ylabel('No. Stat5 peaks')
    motifname = os.path.basename(motiffile)
    pylab.title(motifname.split('.')[0])
    pylab.savefig(motifname.split('.')[0] + '.png')
def yield_motifs():
    """Yield (name, motif) pairs for every matrix in the Jaspar PWM file.

    Each matrix is yielded twice: under the lower-cased last token of its
    '>' header line, and as its reverse complement under that name plus '-R'.
    A hard-coded 'coup2' matrix is yielded the same way at the end.
    """
    is_header = methodcaller('startswith', '>')
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as jaspar_handle:
        for header_group, block in groupby(jaspar_handle, is_header):
            if header_group:
                # Only the first header line of a run is used for the name.
                header = next(block)
                name = header.strip().split()[-1].lower()
                continue
            matrix_text = ''.join(block)
            motif = Motif.read(StringIO(matrix_text), 'jaspar-pfm')
            yield name, motif
            yield name + '-R', motif.reverse_complement()

    tmp = u"""A 0 0 6 1 0 0 0 4 2 2 0 0 3
C 1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0
G 0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0
T 6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""
    coup2 = Motif.read(StringIO(tmp), 'jaspar-pfm')
    yield 'coup2', coup2
    yield 'coup2-R', coup2.reverse_complement()
def yield_motifs():
    """Yield (name, motif) pairs for every matrix in the Jaspar PWM file.

    Each matrix is yielded twice: under the lower-cased last token of its
    '>' header line, and as its reverse complement under that name plus '-R'.
    """
    is_header = methodcaller('startswith', '>')
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as jaspar_handle:
        for header_group, block in groupby(jaspar_handle, is_header):
            if header_group:
                # Only the first header line of a run is used for the name.
                header = next(block)
                name = header.strip().split()[-1].lower()
                continue
            matrix_text = ''.join(block)
            motif = Motif.read(StringIO(matrix_text), 'jaspar-pfm')
            yield name, motif
            yield name + '-R', motif.reverse_complement()
def _compute(self):
    """Return, per index, the best score among the preceding window offsets.

    With ``windowLen`` the length of the motif read from ``self._pfmFileName``,
    element ``j`` of the result is the NaN-ignoring maximum of
    ``pwmScores[j - n]`` and ``complementPwmScores[j - n]`` for every offset
    ``n`` in ``range(windowLen)`` with ``j - n >= 0``. An element stays NaN
    when all of those scores are NaN.

    Returns:
    A float32 numpy array with ``len(pwmScores)`` elements.
    """
    with open(self._pfmFileName) as pfmFile:
        windowLen = len(Motif.read(pfmFile, "jaspar-pfm"))
    pwmScores = self._pwmScoreArrayStat.getResult()
    complementPwmScores = self._complementPwmScoreArrayStat.getResult()

    numScores = len(pwmScores)
    # A running maximum needs one output vector instead of a
    # (2 * windowLen, numScores) scratch matrix reduced with nanmax.
    best = np.empty(numScores, dtype='float32')
    best.fill(np.nan)
    # Offsets beyond the array length cannot contribute anything; the cap
    # also avoids the shape mismatch they caused before.
    for n in range(min(windowLen, numScores)):
        shifted = best[n:]
        # fmax ignores NaN unless both operands are NaN, like nanmax.
        np.fmax(shifted, pwmScores[0:numScores - n], out=shifted)
        np.fmax(shifted, complementPwmScores[0:len(complementPwmScores) - n], out=shifted)
    return best
def yield_motifs():
    """Yield (name, motif) pairs for every matrix in 'matrix_only.txt'.

    Each matrix is yielded twice: under the lower-cased last token of its
    '>' header line, and as its reverse complement under that name plus '-R'.
    """
    motifdir = '/home/will/Tip60Data/TFdata/'
    is_header = methodcaller('startswith', '>')
    with open(motifdir + 'matrix_only.txt') as jaspar_handle:
        for header_group, block in groupby(jaspar_handle, is_header):
            if header_group:
                # Only the first header line of a run is used for the name.
                header = next(block)
                name = header.strip().split()[-1].lower()
                continue
            matrix_text = ''.join(block)
            motif = Motif.read(StringIO(matrix_text), 'jaspar-pfm')
            yield name, motif
            yield name + '-R', motif.reverse_complement()
def _compute(self): sequence = self._sequenceStat.getResult().valsAsNumpyArray() bioSeq = Seq(sequence.tostring(), alphabet=IUPAC.unambiguous_dna) thisPwm = Motif.read(open(self._pfmFileName), "jaspar-pfm") if self._complement: thisPwm = thisPwm.reverse_complement() try: pwmScoreArray = thisPwm.scanPWM(bioSeq) except MemoryError, e: #when sequence is shorter than pwm return
def _compute(self):
    """Return, per index, the best score among the preceding window offsets.

    With ``windowLen`` the length of the motif read from ``self._pfmFileName``,
    element ``j`` of the result is the NaN-ignoring maximum of
    ``pwmScores[j - n]`` and ``complementPwmScores[j - n]`` for every offset
    ``n`` in ``range(windowLen)`` with ``j - n >= 0``. An element stays NaN
    when all of those scores are NaN.

    Returns:
    A float32 numpy array with ``len(pwmScores)`` elements.
    """
    from Bio import Motif
    with open(self._pfmFileName) as pfmFile:
        windowLen = len(Motif.read(pfmFile, "jaspar-pfm"))
    pwmScores = self._pwmScoreArrayStat.getResult()
    complementPwmScores = self._complementPwmScoreArrayStat.getResult()

    numScores = len(pwmScores)
    # A running maximum needs one output vector instead of a
    # (2 * windowLen, numScores) scratch matrix reduced with nanmax.
    best = np.empty(numScores, dtype='float32')
    best.fill(np.nan)
    # Offsets beyond the array length cannot contribute anything; the cap
    # also avoids the shape mismatch they caused before.
    for n in range(min(windowLen, numScores)):
        shifted = best[n:]
        # fmax ignores NaN unless both operands are NaN, like nanmax.
        np.fmax(shifted, pwmScores[0:numScores - n], out=shifted)
        np.fmax(shifted, complementPwmScores[0:len(complementPwmScores) - n], out=shifted)
    return best
def search_motif(motiflist, seq, col, extend):
    """Search a motif PWM in paired lists of sequences and controls.

    Keyword arguments:
    motiflist -- Path of the motif file in Jaspar PFM format.
    seq -- Iterable of sequences to search.
    col -- Iterable of control sequences, consumed in lockstep with ``seq``.
    extend -- Offset used for the shift distances; 0 disables them.

    Returns:
    (freq_list, background, motif_length, shift_list) where freq_list holds
    the number of hits (threshold 7.0) per sequence, background is the total
    number of hits over all controls, and shift_list collects, for every hit
    position ``p``, ``abs(extend - p)`` when ``p >= 0`` and
    ``abs(-extend - p)`` otherwise.
    """
    with open(motiflist) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')

    freq_list = []
    shift_list = []
    background = 0
    for sequence, control in itertools.izip(seq, col):
        positions = [pos for pos, score in mf.search_pwm(sequence, threshold=7.0)]
        freq_list.append(len(positions))
        # Distances only exist for a non-zero extend; the old code referenced
        # an unassigned list in the extend == 0 case.
        if extend != 0:
            shift_list.extend(
                abs(extend - base) if base >= 0 else abs(-1 * extend - base)
                for base in positions)
        background += sum(1 for _hit in mf.search_pwm(control, threshold=7.0))
    return freq_list, background, len(mf), shift_list
def _compute(self): from Bio.Alphabet import IUPAC from Bio.Seq import Seq from Bio import Motif sequence = self._sequenceStat.getResult().valsAsNumpyArray() bioSeq = Seq(sequence.tostring(), alphabet=IUPAC.unambiguous_dna) thisPwm = Motif.read(open(self._pfmFileName), "jaspar-pfm") if self._complement: thisPwm = thisPwm.reverse_complement() try: pwmScoreArray = thisPwm.scanPWM(bioSeq) except MemoryError, e: #when sequence is shorter than pwm return
def search_motif(motiflist, seq1, seq2):
    """Count, for two sequence sets, how many sequences contain the motif.

    A sequence counts as containing the motif when its best ``search_pwm``
    score exceeds 80% of the motif's maximum score; a sequence without any
    hit is treated as scoring -1e5.

    Keyword arguments:
    motiflist -- Path of the motif file in Jaspar PFM format.
    seq1 -- Iterable of sequences counted in column 0.
    seq2 -- Iterable of sequences counted in column 1.

    Returns:
    A 2x2 numpy array; row 0 counts sequences above the cutoff, row 1 the
    remaining ones.
    """
    with open(motiflist) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')
    cutoff = 0.8 * mf.max_score()

    count_all = np.array([[0, 0], [0, 0]])
    # One shared loop for both sets: the second set used to be compared
    # against the last score left over from the first set.
    for column, sequences in enumerate((seq1, seq2)):
        for sequence in sequences:
            scores = [score for pos, score in mf.search_pwm(sequence)]
            max_score = max(scores) if scores else -1e5
            row = 0 if max_score > cutoff else 1
            count_all[row, column] += 1
    return count_all
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    """Scan every FASTA sequence with every PWM and write the scores as wiggle tracks.

    For each (sequence, PWM) pair the one-strand ``scanPWM`` scores are padded
    with NaN up to the sequence length, passed through ``getMaxPWMScore`` and
    both vectors are handed to ``vegardswritewiggle``.

    Keyword arguments:
    fastafiles -- Iterable of paths to FASTA files holding one record each.
    pwmfiles -- Iterable of paths to PFM files in Jaspar format.
    destdir -- Directory prefix for the 'start_index_score/' and
               'best_score_in_window/' output paths.
    both_strands -- Accepted for interface compatibility; not used here.
    """
    # The parameter is iterated here; the previous code silently relied on a
    # module-level variable called ``fastafile``.
    for fastaf in fastafiles:
        # Sequence only needed for length here. MOODS does this parsing again
        # later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        thisseq = Bio.SeqIO.read(fastaf, "fasta", alphabet=IUPAC.unambiguous_dna)
        print('Doing sequence  %s length= %s' % (thisseqname, len(thisseq)))
        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            with open(pwmf) as pwmhandle:
                thispwm = Motif.read(pwmhandle, "jaspar-pfm")
            print(' Doing scanPWM one strands for pwm  %s , length= %s %s'
                  % (thispwmname, len(thispwm[0]), datetime.now()))
            onestrandsindexvector = thispwm.scanPWM(thisseq.seq)
            # Adding missing bp-values on the end to get the same length as
            # seq; copying a slice keeps the dtype of the score vector.
            x = onestrandsindexvector[0:len(thispwm) - 1].copy()
            x[:] = np.nan
            onestrandsindexvector = np.append(onestrandsindexvector, x)
            # A new leading axis gives the 1 x N layout without the slow
            # np.array([vector]) copy.
            onestrandsindexvector = onestrandsindexvector[np.newaxis, :]
            print(' bp with nan score is  %s  expected  %s'
                  % (np.isnan(onestrandsindexvector).sum(), len(thispwm) - 1))
            print(' finding best score per bp,  %s' % (datetime.now(),))
            bestscorevector = getMaxPWMScore(onestrandsindexvector, len(thispwm))
            print(' writing wiggle for score per start index. %s' % (datetime.now(),))
            vegardswritewiggle(onestrandsindexvector[0, ],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'start_index_score/' + thispwmname)
            print(' writing wiggle for bestscore.  %s' % (datetime.now(),))
            vegardswritewiggle(bestscorevector,
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'best_score_in_window/' + thispwmname)
# 1. <inputFileName>.png: The logo graphic. ##################################################################################################################### import sys import os import math from Bio import Motif # Reading input nucsPerImage = int(sys.argv[1]) inputFileLocation = sys.argv[2] outputLocation = sys.argv[3] # Reading pfm file inputFile = open(inputFileLocation, "r") pwm = Motif.read(inputFile, "jaspar-pfm") # Writing whole or splited logo if (nucsPerImage <= 0): pwm.weblogo(outputLocation + (inputFileLocation.split("/")[-1].split(".")[0]) + ".png", res=300) else: tempPWM = [[], [], [], []] nucs = ["A", "C", "G", "T"] counter = 0 fileCount = 0 for i in range(0, len(pwm)): for j in range(0, len(nucs)): tempPWM[j].append(pwm.counts[nucs[j]][i]) counter += 1
def make_seq(seq, comp = False): if comp: return Seq(seq,Alphabet.IUPAC.unambiguous_dna).reverse_complement() else: return Seq(seq,Alphabet.IUPAC.unambiguous_dna) def score_seq(mot, seq, comp = False): return mot.scanPWM(make_seq(seq, comp = comp))[0] tmp = u"""A 1 2 0 0 0 2 0 0 1 2 C 1 1 0 0 5 0 1 0 1 0 G 4 4 8 8 2 4 5 6 6 0 T 2 1 0 0 1 2 2 2 0 6""" sp1_mot = Motif.read(StringIO(tmp), 'jaspar-pfm') # <codecell> test_seqs = [('sp3', 'GAGGCGTGGC'), ('sp2', 'TGGGCGGGAC'), ('sp1', 'GGGGAGTGGC')] res = [] for name, base_seq in test_seqs: bs = list(base_seq) mat = np.zeros((6, len(bs)+2)) for n in range(len(bs)): olet = bs[n] for ln, let in enumerate('ACTG'): bs[n] = let
##########testing ....... #calculate_both_strands=True #outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput' #fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'] #pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'] #fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa' pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm' destdir = outputdir jaspar_file='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt' thispwm = Motif.read(open(jaspar_file), "jaspar-pfm") #pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm'] #fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt' #fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta' #fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' #fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa' #### test av biopythons motif pakke med scanPWM print "ferdig i pwmtest4"
mot = Motif.read(StringIO(tmp), 'jaspar-pfm') yield name, mot yield name+'-R', mot.reverse_complement() pwm_dict = {} for num, (name, mot) in enumerate(yield_motifs()): if num % 100 == 0: print num pwm_dict[name] = mot tmp = u"""A 0 0 6 1 0 0 0 4 2 2 0 0 3 C 1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0 G 0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 T 6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4""" pwm_dict['coup2'] = Motif.read(StringIO(tmp), 'jaspar-pfm') pwm_dict['coup2-R'] = Motif.read(StringIO(tmp), 'jaspar-pfm').reverse_complement() # <codecell> from Bio.Alphabet import IUPAC def score_seq(seq, mot): bseq = Seq(seq, alphabet=IUPAC.unambiguous_dna) scores = mot.scanPWM(bseq) for pos, score in enumerate(scores.flatten(),1): if ~np.isnan(score): tseq = seq[pos:pos+len(mot)] yield pos, tseq, score
def procurar(self, sequencia_string):
    '''Search the sequence with every named Jaspar motif in self.diretorio.

    Motif files ("*.pfm") are matched by their "<digits>_<digits>" id against
    the first tab-separated column of "matrix_list.txt"; the motif name is
    taken from the third column (the part after its first "_"). Only the
    first motif found for a given name is loaded.

    Every hit (threshold 10) is labelled with a region: "1" at or before the
    first self.pontas percent of the sequence, "3" at or after the last
    self.pontas percent, "2" in between.

    Returns a tuple:
    motivos_finais -- list of (position, score, motif name, region).
    tamanho_seq -- length of the input sequence.
    lista_nomes -- names of the loaded motifs.
    saida_hmm -- list of (region, motif name).
    classificar -- list of sorted [region, motif name] pairs.
    '''
    files = glob.glob(self.diretorio + "*.pfm")
    with open(self.diretorio + "matrix_list.txt", 'r') as arquivo_nomes:
        nomes_vetor = arquivo_nomes.read().split('\n')

    lista_motivos = []
    lista_nomes = []
    motivos_checar_repetidos = {}
    for motivo in files:
        isolar_nome = re.search(r'(\d*_\d*)\.pfm$', motivo)
        if isolar_nome is None:
            # File name without the expected id: nothing to look up.
            continue
        isolado = isolar_nome.group(1)
        for n_pesquisa in nomes_vetor:
            isolado_pesquisa = n_pesquisa.split('\t')
            if isolado_pesquisa[0] != isolado:
                continue
            nome_recolocar = isolado_pesquisa[2].split('_')[1]
            if nome_recolocar in motivos_checar_repetidos:
                # Already loaded under this name; only count the repeat.
                motivos_checar_repetidos[nome_recolocar] += 1
            else:
                motivos_checar_repetidos[nome_recolocar] = 1
                with open(motivo) as arquivo_pfm:
                    lista_motivos.append(Motif.read(arquivo_pfm, "jaspar-pfm"))
                lista_nomes.append(nome_recolocar)

    sequencia = Seq(sequencia_string.upper())
    for converter in lista_motivos:
        converter.make_instances_from_counts()

    tamanho_seq = len(sequencia_string)
    tss = ((tamanho_seq * self.pontas) / 100)
    three_end = tamanho_seq - tss

    motivos_finais = []
    saida_hmm = []
    classificar = []
    # Motifs and names were appended together, so pairing them replaces the
    # hand-maintained name index.
    for procura, nome in zip(lista_motivos, lista_nomes):
        for pos, score in procura.search_pwm(sequencia, threshold=10):
            # Same precedence as the original chain of ifs: "3" wins when
            # the two end regions overlap.
            if pos >= three_end:
                posicao = "3"
            elif pos <= tss:
                posicao = "1"
            else:
                posicao = "2"
            motivos_finais.append((pos, score, nome, posicao))
            saida_hmm.append((posicao, nome))
            classificar.append(sorted((posicao, nome)))
    return (motivos_finais, tamanho_seq, lista_nomes, saida_hmm, classificar)
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' #fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa' #### test av biopythons motif pakke med scanPWM #matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9 from Bio import Motif from Bio.Seq import Seq from Bio.Alphabet import IUPAC from datetime import datetime # Let's create an instance of the E2F1 motif (downloaded from the # jaspar database): testpwm = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm' motif = Motif.read(open(testpwm), "jaspar-pfm") # the format method displays the motif in a variety of formats: print motif.format('transfac') fastafile = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' fastafile = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa' handle = open(fastafile, "r") records = list(Bio.SeqIO.parse(handle, "fasta", alphabet=IUPAC.unambiguous_dna)) handle.close() thisseq = records[0].seq print datetime.now() hits = motif.scanPWM(thisseq) print datetime.now()
makePWMscorefiles(fastafile[0:1], pwmfiles[0:1], outputdir) print datetime.now() ##########testing ....... #calculate_both_strands=True #outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput' #fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'] #pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'] #fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/spombe2007/Sequence/DNA/chr1.fa' #fastaf = '' #pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm' pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/matrix/JASPAR_CORE_2008/MA0086.pwf' destdir = outputdir jaspar_file = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt' thispwm = Motif.read(open(jaspar_file), "jaspar-pfm") #pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm'] #fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt' #fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta' #fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' #fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa' #### test av biopythons motif pakke med scanPWM print "ferdig i pwmtest4"
def test_sites_parsing(self):
    """Check that a Jaspar sites file is parsed into a motif of length 6."""
    parsed_length = Motif.read(self.SITESin, "jaspar-sites").length
    assert parsed_length == 6
def reverse(sequence): retSeq = [] for c in sequence[::-1]: retSeq.append(revDict[c]) return "".join(retSeq) # Fetching sequences bedName = bedFileName.split("/")[-1][:-4] os.system("fastaFromBed -fi " + fastaFileName + " -fo " + outputLocation + bedName + ".txt" + " -bed " + bedFileName + " -tab") # Reading pwm pwmFile = open(pwmFileName, "r") pwm = Motif.read(pwmFile, "jaspar-pfm") motif = str(pwm.consensus()).upper() pwmFile.close() # Reading input vectors misVec = [] misVecSpec = [] posVec = [] posVecSpec = [] scoreVec = [] bedFile = open(bedFileName, "r") seqFile = open(outputLocation + bedName + ".txt", "r") for bedLine in bedFile: # Reading line seqLine = seqFile.readline()
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' #fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa' #### test av biopythons motif pakke med scanPWM #matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9 from Bio import Motif from Bio.Seq import Seq from Bio.Alphabet import IUPAC from datetime import datetime # Let's create an instance of the E2F1 motif (downloaded from the # jaspar database): testpwm= '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm' motif=Motif.read(open(testpwm), "jaspar-pfm") # the format method displays the motif in a variety of formats: print motif.format('transfac') fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt' fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa' handle = open(fastafile, "r") records = list(Bio.SeqIO.parse(handle, "fasta", alphabet=IUPAC.unambiguous_dna)) handle.close() thisseq = records[0].seq print datetime.now() hits = motif.scanPWM(thisseq) print datetime.now()
def test_pfm_parsing(self):
    """Check that a Jaspar pfm file is parsed into a motif of length 12."""
    parsed_length = Motif.read(self.PFMin, "jaspar-pfm").length
    assert parsed_length == 12
import sys
import random
import os

import numpy as np
from Bio import SeqIO
from Bio import Motif
from scipy.stats import fisher_exact

# Directory holding one matrix file per name listed in 'logolist.txt'.
loc = '/compbio/data/motif/human-mouse/'

# For every listed matrix: transpose it, save it as an integer PFM in the
# working directory, and render that PFM as '<name>.png'.
with open('logolist.txt') as ifp:
    for line in ifp:
        name = line.rstrip()
        if not name:
            # A blank line would otherwise make loadtxt read the directory.
            continue
        mat = np.loadtxt(loc + name)
        pfm = np.transpose(mat)
        np.savetxt(name, pfm, fmt='%d')
        with open(name) as pfm_handle:
            mymotif = Motif.read(pfm_handle, 'jaspar-pfm')
        mymotif.weblogo(name + '.png')
def __init__(self, regions, genome_fasta, jaspar_file=None, jaspar_thresh=9999, annotations=None, motif_positions=None, method='motility'):
    """
    Adds motif tracks to BasePrinter, using motility and a file containing a
    JASPAR-format definition of a motif.

    :param regions: An iterable of pybedtools.Interval objects
    :param genome_fasta: FASTA file from which sequences for `regions` will
        be extracted
    :param jaspar_file: A file in JASPAR format holding exactly one motif.
        It is always opened here, so the default of None does not work as
        the code is written.
    :param jaspar_thresh: Score threshold stored as `self.jaspar_thresh`.
        With method "biopython" it is replaced by the patser threshold of
        the motif's score distribution.
    :param annotations: Optional dict; each value is passed to
        pybedtools.BedTool and saved under the same key.
    :param motif_positions: List of integer indexes stored as
        `self.motif_positions` (empty list when None).
    :param method: "motility" or "biopython"
    """
    super(MotifPrinter, self).__init__(regions=regions, genome_fasta=genome_fasta)

    import os
    import tempfile

    import motility

    assert method in ['biopython', 'motility']
    self.method = method

    pwm = list(helpers.pwm_from_jaspar(jaspar_file))
    assert len(pwm) == 1
    self.pwm = motility.PWM(pwm[0][1])

    # Biopython needs a bare count matrix: drop the '>' header lines and the
    # bracket / row-letter decoration. A private temporary file replaces the
    # fixed 'tmp' file that used to be left behind in the working directory.
    stripped = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        with open(jaspar_file) as jaspar_handle:
            for line in jaspar_handle:
                if line.startswith('>'):
                    continue
                for i in '[]ATCG':
                    line = line.replace(i, '')
                stripped.write(line)
        stripped.close()
        with open(stripped.name) as matrix_handle:
            self.motif = Motif.read(matrix_handle, 'jaspar-pfm')
    finally:
        stripped.close()
        os.remove(stripped.name)

    if method == 'biopython':
        sd = Motif.ScoreDistribution(self.motif)
        jaspar_thresh = sd.threshold_patser()

    self.jaspar_thresh = jaspar_thresh
    if motif_positions is None:
        motif_positions = []
    self.motif_positions = motif_positions

    self._annotations = {}
    if annotations:
        for k, v in annotations.items():
            self._annotations[k] = pybedtools.BedTool(v).saveas()

    self.trackfuncs.append(self.motifs)
    self.trackfuncs.append(self.annotations)
    self.intervals = []
ostr += ' ' + str(count[l]) ostr += '\n' count_dict[name] = ostr # <codecell> print(count_dict['COUP2']) # <codecell> from io import StringIO motif_dict = {} for key, tmp in count_dict.items(): print key, type(tmp) motif_dict[key] = Motif.read(StringIO(tmp), 'jaspar-pfm') tmp = u"""A 0 0 16 5 3 0 16 C 1 0 2 12 0 15 0 G 0 15 0 1 1 3 1 T 17 3 0 0 14 0 1""" motif_dict['AP1'] = Motif.read(StringIO(tmp), 'jaspar-pfm') tmp = u"""A 3 1 4 2 4 2 18 18 0 C 0 1 1 9 2 15 0 0 6 G 0 4 6 2 10 0 0 0 2 T 15 12 7 5 2 1 0 0 10""" motif_dict['CEBP'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
def test_pfm_parsing(self):
    """Check that a Jaspar pfm file is parsed into a motif of length 12."""
    parsed_length = Motif.read(self.PFMin, "jaspar-pfm").length
    assert parsed_length == 12
from Bio import Motif
from scipy.stats import fisher_exact


def search_motif(mf, seq):
    """Return the position of the best motif hit in seq, or None.

    The best hit is the first hit with the highest ``search_pwm`` score; it
    is only reported when that score exceeds 80% of the motif's maximum score.
    """
    cutoff = 0.8 * mf.max_score()
    result = [(score, pos) for pos, score in mf.search_pwm(seq)]
    if not result:
        return None
    # max() keeps the first of equal scores, like scores.index(max(scores)).
    best_score, best_pos = max(result, key=lambda item: item[0])
    if best_score > cutoff:
        return best_pos
    return None


# Usage: <script> <motif.pfm> <sequences.fasta>
# Writes the sequences containing the motif to '<sequences.fasta>.motif.fa',
# with the hit position appended to the record id, and prints a summary.
count = 0
total = 0
with open(sys.argv[1]) as motif_handle:
    mf = Motif.read(motif_handle, 'jaspar-pfm')
with open(sys.argv[2] + '.motif.fa', 'w') as ofp:
    for total, record in enumerate(SeqIO.parse(sys.argv[2], 'fasta'), 1):
        hit = search_motif(mf, record.seq)
        if hit is None:
            continue
        record.id = record.id + '_' + str(hit)
        ofp.write('>' + record.id + '\n')
        ofp.write(str(record.seq) + '\n')
        count += 1
print(str(total) + '\ttotal seq(s)')
# An empty FASTA file used to end in a NameError; report 0% instead.
percent = count / float(total) * 100 if total else 0.0
print(str(count) + '\tcontains motifs (' + str(percent) + '%)')
def test_sites_parsing(self):
    """Check that a Jaspar sites file is parsed into a motif of length 6."""
    parsed_length = Motif.read(self.SITESin, "jaspar-sites").length
    assert parsed_length == 6
def __init__(self, type, score_type, upstr=None, invariant=None,
             score_dict={}, bounds=None, filter_score=None,
             note_str_func=default_note_str,
             **attribs):
    """Set up a motif type, load its scores and register it in motif_types.

    Keyword arguments:
    type -- Name of the motif type; key under which the instance is stored
            in ``motif_types``.
    score_type -- One of 'acora', 'tst', 'max_ent' or 'pfm'; selects how
                  ``score_dict`` is interpreted and which method becomes
                  ``self.score``.
    upstr -- Stored as ``self.upstr``.
    invariant -- Stored as ``self.invariant``; a new empty list when omitted.
    score_dict -- 'acora' / 'tst': path of a file with whitespace-separated
                  records; 'pfm': glob pattern of .pfm files; ignored for
                  'max_ent'. The ``{}`` default is never stored or mutated.
    bounds -- Stored as ``self.bounds``.
    filter_score -- A number keeps scores greater than it; a tuple
                    (low, high) keeps scores below low or above high;
                    None keeps every score.
    note_str_func -- Stored as ``self.note_str``.
    attribs -- Remaining keyword arguments, stored as ``self.attribs``.

    Raises:
    NeedScoreDictFileException -- for 'acora' when score_dict is not a str.
    """
    self.type = type
    self.upstr = upstr
    # A list default in the signature would be shared by every instance.
    self.invariant = [] if invariant is None else invariant
    self.filter_score = filter_score
    self.note_str = note_str_func
    self.bounds = bounds
    self.score_type = score_type

    #------------------------------------------------
    # FIRST: figure out what score type this motif is.
    #------------------------------------------------
    # aho-corasick search tree
    if score_type == 'acora':
        # we need a score dict from a file and a separate acora object
        if not isinstance(score_dict, str):
            raise NeedScoreDictFileException
        with open(score_dict) as records:
            self.score_dict = dict([record.split() for record in records])
        self.acora_tree = acora.AcoraBuilder(self.score_dict.keys()).build()
        self.score = self.acora
    # ternary search tree
    elif score_type == 'tst':
        # make our file of nmers and scores a tst object
        self.score_dict = tst.TST()
        with open(score_dict) as records:
            for record in records:
                self.score_dict.put(*record.split())
        self.score = self.tst
    # max ent nmer score
    elif score_type == 'max_ent':
        self.score_dict = {}
        self.score = self.max_ent
        # open up an 'interactive' pipe to the maxent software for this motif
        programs = {'me_splice_donor': 'score5',
                    'me_splice_acceptor': 'score3'}
        self.command = cfg.programTemplate.substitute(path=cfg.maxEntPath,
                                                      program=programs[self.type])
    # position frequency matrix list
    elif score_type == 'pfm':
        print("Loading position frequency matrices from {}...".format(self.type))
        pfm_glob = glob.glob(score_dict)
        name_pattern = re.compile(r'.*/(\w+).pfm')
        self.score_dict = {}
        for motif_file in pfm_glob:
            with open(motif_file) as motif_handle:
                motif_obj = Motif.read(motif_handle, 'jaspar-pfm')
            motif_name = name_pattern.match(motif_file).group(1)
            print("\t{}...".format(motif_name))
            motif_obj.name = motif_name
            # Only motifs longer than 7 positions are kept, so only those
            # get a score distribution and threshold.
            if len(motif_obj) > 7:
                self.score_dict[motif_name] = motif_obj
                motif_obj.sd = \
                    ScoreDistribution(motif_obj, precision=10 ** 3)
                # low false-positive rate to make sure motifs are real
                motif_obj.thresh = max(1, motif_obj.sd.threshold_fpr(0.01))
        self.score = self.pfm
        print("Motif matrices done.")

    #------------------------------------------------
    # SECOND: parse filter score information.
    #------------------------------------------------
    # if filter score is a number, make it a function that determines
    # whether or not it is a 'worthwhile' score; this could be as simple as
    # > 0, or it could be a range, etc, etc. The function returns true if
    # the score should be kept and false if it should not.
    if isinstance(self.filter_score, (float, int)):
        minimum = self.filter_score
        self.filter_score = lambda val: val > minimum
    elif isinstance(self.filter_score, tuple):
        minmax = self.filter_score
        self.filter_score = lambda val: val < minmax[0] or val > minmax[1]
    # if filter_score is none, always return true
    elif self.filter_score is None:
        self.filter_score = lambda val: True

    #------------------------------------------------
    # THIRD: add motif to motif type dict and cleanup
    #------------------------------------------------
    motif_types[self.type] = self
    self.attribs = attribs