Example #1
0
File: hist.py Project: cauyrd/MSI
def draw_plot(motiffile):
    """Plot motif-hit values of target sequences against a control set.

    The motif is read from ``motiffile`` (JASPAR PFM format).  Target
    sequences are taken from the FASTA file named by ``sys.argv[2]`` and
    control sequences from the one named by ``sys.argv[3]``.  Every sequence
    is passed to ``search_motif``; sequences for which it returns ``None``
    are ignored.  Target hits are drawn as a green histogram, control hits
    as a red line, and the figure is saved as ``<name>.png`` in the current
    directory.

    Arguments:
    motiffile -- path to the JASPAR PFM file.  The part of its base name
                 before the first '.' is used as the plot title and as the
                 stem of the output file name.
    """
    # Close the motif file as soon as it has been parsed.
    with open(motiffile) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')

    def collect_hits(fasta_path):
        # One search_motif result per record, dropping the "no hit" Nones.
        hits = []
        for record in SeqIO.parse(fasta_path, 'fasta'):
            hit = search_motif(mf, record.seq)
            # Identity test: ``==`` may be overloaded by the hit's type.
            if hit is not None:
                hits.append(hit)
        return hits

    count = collect_hits(sys.argv[2])
    control = collect_hits(sys.argv[3])

    # assume the sequence length is 201, center base +/- 100bp
    edges = np.linspace(-100, 100, 101)
    pylab.figure()
    pylab.hist(count, edges, color='g')
    # Only the per-bin counts are needed; the returned edges are discarded
    # (the original bound them to a local named ``bin``, shadowing the builtin).
    num, _ = np.histogram(control, edges)
    pylab.plot(np.linspace(-100, 100, 100), num, color='r')
    pylab.xlabel('Distance relative to Stat5 motif')
    pylab.ylabel('No. Stat5 peaks')
    stem = os.path.basename(motiffile).split('.')[0]
    pylab.title(stem)
    pylab.savefig(stem + '.png')
Example #2
0
    def __call__(self, fasta):
        """Run MEME on ``fasta`` and return the predicted sites.

        The values returned by ``run_meme`` are stored on the instance
        (``meme_cmd_args``, ``stdoutdata``, ``starts``, ``Zs``, ``thetas``,
        ``lambdas``).  ``meme.txt`` in ``self.options.output_dir`` is then
        parsed with Biopython's MEME parser.

        Returns a list of ``(sequence_name, Interval, is_negative_strand)``
        tuples, one per motif instance, where the interval starts at the
        parser's start minus one and spans ``motif.length`` positions.
        """
        start_time = time.time()

        ensure_dir_exists(self.options.output_dir)

        # run MEME
        (self.meme_cmd_args, self.stdoutdata, self.starts, self.Zs,
         self.thetas, self.lambdas) = run_meme(fasta, self.options)

        # parse output
        from Bio import Motif
        predictions = []
        meme_output = os.path.join(self.options.output_dir, 'meme.txt')
        # Iterate while the file is still open in case the parser reads
        # lazily; the handle is closed when the block exits.
        with open(meme_output) as handle:
            for motif in Motif.parse(handle, "MEME"):
                for instance in motif.instances:
                    # MEME parser seems to count from 1, not 0
                    start = instance.start - 1
                    predictions.append((
                        instance.sequence_name,
                        Interval(start, start + motif.length),
                        instance.strand == '-',
                    ))

        logger.info('MEME took %.1f seconds', time.time() - start_time)

        return predictions
Example #3
0
def readPwmFile(pwmFileName, outputLocation, pseudocounts=0.0):
    """Reads a PWM file in Jaspar format and returns a Biopython PWM object.

    Every numeric cell of the input has ``pseudocounts`` added to it; the
    result is written to a temporary file which is parsed as 'jaspar-pfm'
    and then deleted, also when parsing fails.

    Keyword arguments:
    pwmFileName -- The path + name of the PWM file.
    outputLocation -- Path to write temporary pseudocount-pwm PWM. It is
                      joined to the file name by plain string concatenation,
                      so it must end with a path separator.
    pseudocounts -- Amount of pseudocounts to add in each matrix cell. (default 0.0)

    Returns:
    pwm -- Biopython PWM object.
    """

    # Adding pseudocounts
    # split() without an argument tolerates tabs and runs of spaces, which
    # split(" ") turned into empty cells that float() rejects.
    with open(pwmFileName, "r") as pwmFile:
        pseudocounted = [
            " ".join([str(float(e) + pseudocounts) for e in line.split()]) + "\n"
            for line in pwmFile
        ]

    tempFileName = outputLocation + os.path.basename(pwmFileName) + "temp"
    try:
        with open(tempFileName, "w") as pwmFileT:
            pwmFileT.writelines(pseudocounted)

        # Creating PWM from pseudocounted input
        with open(tempFileName, "r") as pwmFileT:
            pwm = Motif.read(pwmFileT, "jaspar-pfm")
    finally:
        # os.remove instead of a shell "rm": no shell quoting problems with
        # unusual file names, and it works on every platform.
        if os.path.exists(tempFileName):
            os.remove(tempFileName)
    return pwm
Example #4
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    """Score every FASTA sequence with every PWM and write wiggle files.

    For each (sequence, PWM) pair the sequence is scanned with ``scanPWM``,
    the score vector is padded with ``len(pwm) - 1`` NaN values so it has the
    same length as the sequence, ``getMaxPWMScore`` is applied to it, and
    both vectors are written with ``vegardswritewiggle`` below
    ``destdir/start_index_score/`` and ``destdir/best_score_in_window/``.

    Arguments:
    fastafiles -- iterable of FASTA paths, each holding a single record.
    pwmfiles -- iterable of JASPAR PFM paths.
    destdir -- directory below which the wiggle files are written.
    both_strands -- accepted for interface compatibility; not used here.
    """
    # Iterate the parameter; the original looped over the unrelated name
    # ``fastafile`` and so ignored (or failed on) the argument.
    for fastaf in fastafiles:
        ### sequence only needed for length here. MOODS does this parsing again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        thisseq = Bio.SeqIO.read(fastaf, "fasta", alphabet=IUPAC.unambiguous_dna)
        #thisseq = Bio.SeqIO.parse(thisseqname, "fasta", alphabet=IUPAC.unambiguous_dna)
        # Progress messages are single pre-formatted strings so the output
        # is the same text the Python 2 print statements produced.
        print('Doing sequence  %s length= %s' % (thisseqname, len(thisseq)))

        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            with open(pwmf) as pwmhandle:
                thispwm = Motif.read(pwmhandle, "jaspar-pfm")
            print(' Doing scanPWM one strands for pwm  %s , length= %s %s'
                  % (thispwmname, len(thispwm[0]), datetime.now()))
            onestrandsindexvector = thispwm.scanPWM(thisseq.seq)
            # adding missing bp-values on the end to get the same length as seq.
            # Copying a slice keeps the dtype of the score vector.
            x = onestrandsindexvector[0:len(thispwm) - 1].copy()
            x[:] = np.nan
            onestrandsindexvector = np.append(onestrandsindexvector, x)
            onestrandsindexvector = np.array([onestrandsindexvector])  # takes long time.
            print('  bp with nan score  is  %s  expected  %s'
                  % (np.isnan(onestrandsindexvector).sum(), len(thispwm) - 1))
            print('  finding best score per bp,  %s' % (datetime.now(),))
            bestscorevector = getMaxPWMScore(onestrandsindexvector, len(thispwm))

            print('  writing wiggle for score per start index. %s' % (datetime.now(),))
            vegardswritewiggle(onestrandsindexvector[0, ], name=thispwmname, chr=thisseqname,
                               destpath=destdir + '/' + 'start_index_score/' + thispwmname)

            print('  writing wiggle for bestscore.  %s' % (datetime.now(),))
            vegardswritewiggle(bestscorevector, name=thispwmname, chr=thisseqname,
                               destpath=destdir + '/' + 'best_score_in_window/' + thispwmname)
Example #5
0
def findPFM(jobID, motifObjList, wordObjDict, numMotifs, outputDir):
    """ find the PFM(Position Frequency Matrix) for the top motifs

    For every motif the words in ``motifObj.wordList`` are written to a
    scratch file in JASPAR sites format, each word repeated
    ``wordObjDict[word].O`` times, and parsed back with Biopython to obtain
    the counts.  One ``>seedWord`` section with an A/C/G/T line each is
    appended to ``<jobID>_PFM``, which is finally moved to ``outputDir``.

    Arguments:
    jobID -- prefix of the scratch file (``<jobID>_jasparWordFile``) and of
             the result file (``<jobID>_PFM``).
    motifObjList -- motif objects providing ``seedWord`` and ``wordList``.
    wordObjDict -- maps each word to an object whose ``O`` attribute is the
                   number of times the word is written.
    numMotifs -- accepted for interface compatibility; not used here.
    outputDir -- destination passed to ``shutil.move``.
    """
    #write the words of motif in Jaspar site format like here: https://github.com/biopython/biopython/blob/master/Doc/cookbook/motif/Arnt.sites
    alphaList = ['A', 'C', 'G', 'T']
    siteFileName = ''.join([jobID, '_jasparWordFile'])
    pfmFileName = ''.join([jobID, '_PFM'])
    # The site numbering keeps running across motifs.
    counter = 1
    # Both files hold text, so they are opened in text mode.
    with open(pfmFileName, 'w') as pfmFile:
        for motifObj in motifObjList:
            seedWord = motifObj.seedWord
            #write the words
            with open(siteFileName, 'w') as siteFile:
                for word in motifObj.wordList:
                    wordCount = wordObjDict[word].O
                    for _ in range(int(wordCount)):
                        siteFile.write('>site ' + str(counter) + '\n' + word + '\n')
                        counter += 1

            with open(siteFileName) as siteHandle:
                srf = Motif.read(siteHandle, 'jaspar-sites')
            srf.make_counts_from_instances()
            pfmFile.write('\n>' + seedWord + '\n')
            for alpha in alphaList:
                pfmFile.write(alpha + ' ' + str(srf.counts[alpha]) + '\n')

    # The PFM file is closed (and therefore flushed) before it is moved; the
    # original moved it while it was still open.
    shutil.move(pfmFileName, outputDir)
    # The scratch file only exists if at least one motif was processed.
    if os.path.exists(siteFileName):
        os.remove(siteFileName)
    return
def yield_motifs():
    """Yield ``(name, motif)`` pairs read from the JASPAR PWM text file.

    Each matrix is yielded twice: once under its lower-cased name and once,
    reverse-complemented, under ``name + '-R'``.  After the file has been
    consumed a hard-coded matrix is yielded as 'coup2' / 'coup2-R'.
    """
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
        # groupby alternates between runs of '>' header lines (key True) and
        # runs of non-header lines (key False).
        for key, lines in groupby(handle, methodcaller('startswith', '>')):
            if key:
                # Name = last whitespace-separated token of the first header
                # line in the run, lower-cased.
                name = lines.next().strip().split()[-1].lower()
            else:
                tmp = ''.join(lines)
                mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
                yield name, mot
                yield name+'-R', mot.reverse_complement()
    # NOTE(review): row A below lists 13 counts while rows C, G and T list 16
    # each -- the literal looks truncated; confirm against the source matrix.
    tmp = u"""A 0  0 6 1 0 0 0 4 2 2 0 0 3 
C  1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0  
G  0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 
T  6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""
    mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
    yield 'coup2', mot
    yield 'coup2-R', mot.reverse_complement()
def yield_motifs():
    """Yield ``(name, motif)`` pairs read from the JASPAR PWM text file.

    The file is split into alternating runs of '>' header lines and matrix
    lines.  The name of a matrix is the last whitespace-separated token of
    the first header line preceding it, lower-cased.  Each matrix is yielded
    under that name and, reverse-complemented, under ``name + '-R'``.
    """
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
        for key, lines in groupby(handle, methodcaller('startswith', '>')):
            if key:
                # The builtin next() works on Python 2.6+ and on Python 3;
                # the .next() method exists on Python 2 iterators only.
                name = next(lines).strip().split()[-1].lower()
            else:
                tmp = ''.join(lines)
                mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
                yield name, mot
                yield name + '-R', mot.reverse_complement()
def yield_motifs():
    """Generate ``(name, motif)`` pairs from ``matrix_only.txt``.

    Runs of '>' header lines set the current name (last token of the first
    header line, lower-cased); every following run of matrix lines is parsed
    as 'jaspar-pfm' and emitted both as-is and reverse-complemented (the
    latter under ``name + '-R'``).
    """
    motifdir = '/home/will/Tip60Data/TFdata/'
    is_header = methodcaller('startswith', '>')
    with open(motifdir + 'matrix_only.txt') as handle:
        for header_run, block in groupby(handle, is_header):
            if header_run:
                name = block.next().strip().split()[-1].lower()
                continue
            matrix_text = ''.join(block)
            parsed = Motif.read(StringIO(matrix_text), 'jaspar-pfm')
            yield name, parsed
            # Computed only once the consumer asks for the second item.
            yield name+'-R', parsed.reverse_complement()
 def _compute(self):
     """Return, per position, the best PWM score of any shifted window.

     Row ``2*n`` of the work matrix holds the forward scores shifted right
     by ``n`` positions and row ``2*n + 1`` the complement scores shifted
     likewise, for every ``n`` below the motif length; unfilled cells are
     NaN.  The result is the NaN-ignoring maximum of each column.
     """
     motif = Motif.read(open(self._pfmFileName), "jaspar-pfm")
     windowLen = len(motif)
     forward = self._pwmScoreArrayStat.getResult()
     reverse = self._complementPwmScoreArrayStat.getResult()

     stacked = np.empty((windowLen*2, len(forward)), dtype='float32')
     stacked.fill(np.nan)
     for shift in range(windowLen):
         fwdRow = 2*shift
         revRow = fwdRow + 1
         stacked[fwdRow, shift:] = forward[:len(forward)-shift]
         stacked[revRow, shift:] = reverse[:len(reverse)-shift]

     return np.nanmax(stacked, axis=0)
Example #10
0
 def setUp(self):
     """Open the input fixtures and build a one-instance motif for the tests."""
     # Input fixtures, opened relative to the current working directory.
     # The handles are not closed in this method.
     self.ACin = open("Motif/alignace.out")
     self.MEMEin = open("Motif/meme.out")
     self.PFMin = open("Motif/SRF.pfm")
     self.SITESin = open("Motif/Arnt.sites")
     # Output paths used by the tests.
     self.TFout = "Motif/tf.out"
     self.FAout = "Motif/fa.out"
     # NOTE(review): same path as FAout -- possibly a copy/paste slip; confirm.
     self.PFMout = "Motif/fa.out"
     from Bio.Seq import Seq
     # Motif holding the single instance "ATATA" in the motif's own alphabet.
     self.m = Motif.Motif()
     self.m.add_instance(Seq("ATATA", self.m.alphabet))
Example #11
0
def build_motif(seqs):
    """Create motif from sequences

    Every item of ``seqs`` is added as an instance of a DNA motif and the
    counts are then computed from the instances.

    Returns the motif, or ``None`` (after printing a diagnostic) as soon as
    one instance cannot be added.
    """
    m = Motif.Motif(alphabet=IUPAC.unambiguous_dna)
    for seq in seqs:
        try:
            m.add_instance(Seq(seq, m.alphabet))
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        except Exception:
            print("Diff motif size length?")
            return None
    m.make_counts_from_instances()
    return m
    def _compute(self):
        """Scan the sequence with the PWM read from ``self._pfmFileName``.

        NOTE(review): the visible code never returns ``pwmScoreArray``; the
        rest of the method may be missing from this excerpt -- confirm.
        """
        # Turn the numpy value array into a Biopython DNA sequence.
        sequence = self._sequenceStat.getResult().valsAsNumpyArray()
        bioSeq = Seq(sequence.tostring(), alphabet=IUPAC.unambiguous_dna)

        thisPwm = Motif.read(open(self._pfmFileName), "jaspar-pfm")
        # Optionally score with the reverse complement of the matrix.
        if self._complement:
            thisPwm = thisPwm.reverse_complement()
            
        try:
            pwmScoreArray = thisPwm.scanPWM(bioSeq)
        except MemoryError, e: #when sequence is shorter than pwm
            return
Example #13
0
def parse_meme_output_for_sites(meme_output):
    """Parse MEME-like output

    Reads ``meme_output`` with Biopython's MEME parser and returns a
    ``defaultdict`` mapping each sequence name to a ``P.IntIntervalSet``
    holding one ``[start, start + len(instance))`` interval per predicted
    site.  ``instance.start`` is used exactly as reported by the parser.
    """
    logging.info('Parsing predictions from %s', meme_output)
    predicted_sites = defaultdict(P.IntIntervalSet)
    # Materialise the motifs while the file is open, then release the handle.
    with open(meme_output) as handle:
        motifs = list(Motif.parse(handle, "MEME"))
    for motif in motifs:
        for instance in motif.instances:
            logging.info('Prediction: sequence = %s; site = %s; pos = %3d',
                         instance.sequence_name, instance, instance.start)
            predicted_sites[instance.sequence_name].add(
                P.IntInterval(instance.start, instance.start + len(instance)))
    return predicted_sites
Example #14
0
    def _compute(self):
        """Return, per position, the best PWM score of any shifted window.

        Row ``2 * n`` of the work matrix holds the forward scores shifted
        right by ``n`` positions and row ``2 * n + 1`` the complement scores
        shifted likewise, for every ``n`` below the motif length; cells that
        are not filled stay NaN.  The result is the NaN-ignoring maximum of
        each column.
        """
        from Bio import Motif

        # Only the motif length is needed; close the file right after parsing.
        with open(self._pfmFileName) as pfmFile:
            windowLen = len(Motif.read(pfmFile, "jaspar-pfm"))
        pwmScores = self._pwmScoreArrayStat.getResult()
        complementPwmScores = self._complementPwmScoreArrayStat.getResult()

        ret = np.zeros((windowLen * 2, len(pwmScores)),
                       dtype='float32') + np.float32(np.nan)
        for n in range(0, windowLen):
            ret[2 * n, n:] = pwmScores[0:len(pwmScores) - n]
            ret[2 * n + 1,
                n:] = complementPwmScores[0:len(complementPwmScores) - n]

        return np.nanmax(ret, axis=0)
Example #15
0
def search_motif(motiflist, seq, col, extend):
    """search motif PWM from sequence list

    Arguments:
    motiflist -- path to a JASPAR PFM file.
    seq -- sequences to scan; paired item by item with ``col``.
    col -- control sequences.
    extend -- offset used for the shift distances; with ``extend == 0`` no
              distances are collected.

    Returns ``(freq_list, background, len(motif), shift_list)``:
    freq_list -- number of hits (threshold 7.0) per sequence in ``seq``.
    background -- total number of hits in the control sequences.
    shift_list -- for every hit position ``p``, ``abs(extend - p)`` when
                  ``p >= 0`` and ``abs(-extend - p)`` otherwise.
    """
    with open(motiflist) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')
    freq_list = []
    shift_list = []
    background = 0
    for sequence, control in zip(seq, col):
        positions = [pos for pos, score in mf.search_pwm(sequence, threshold=7.0)]
        freq_list.append(len(positions))
        # The distances are built only when requested.  The original left
        # ``dist`` unbound when extend was 0, raising NameError on the first
        # pair.
        if extend != 0:
            shift_list += [abs(extend - base) if base >= 0 else abs(-1 * extend - base)
                           for base in positions]
        background += sum(1 for _ in mf.search_pwm(control, threshold=7.0))
    return freq_list, background, len(mf), shift_list
    def _compute(self):
        """Scan the sequence with the PWM read from ``self._pfmFileName``.

        The value array of ``self._sequenceStat`` is converted to a Biopython
        DNA sequence and scanned with the matrix, or with its reverse
        complement when ``self._complement`` is set.

        NOTE(review): as far as the visible code goes, the score array is
        computed but not returned; confirm whether the rest of the method was
        lost from this excerpt.
        """
        from Bio.Alphabet import IUPAC
        from Bio.Seq import Seq
        from Bio import Motif

        sequence = self._sequenceStat.getResult().valsAsNumpyArray()
        bioSeq = Seq(sequence.tostring(), alphabet=IUPAC.unambiguous_dna)

        # Close the matrix file as soon as it has been parsed.
        with open(self._pfmFileName) as pfmFile:
            thisPwm = Motif.read(pfmFile, "jaspar-pfm")
        if self._complement:
            thisPwm = thisPwm.reverse_complement()

        try:
            pwmScoreArray = thisPwm.scanPWM(bioSeq)
        # No ``, e`` binding: the exception object was unused and that
        # spelling is valid on Python 2 only.
        except MemoryError:  #when sequence is shorter than pwm
            return
Example #17
0
def search_motif(motiflist, seq1, seq2):
    """search pwm for each motif in the motiflist from sequence

    A sequence counts as a hit when its best ``search_pwm`` score exceeds
    80% of the motif's maximum score; a sequence without any reported score
    is treated as having a best score of -1e5.

    Arguments:
    motiflist -- path to a JASPAR PFM file.
    seq1, seq2 -- the two groups of sequences to compare.

    Returns a 2x2 integer array: row 0 counts hits and row 1 misses, column 0
    refers to ``seq1`` and column 1 to ``seq2``.
    """
    count_all = np.array([[0, 0], [0, 0]])
    with open(motiflist) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')
    cutoff = 0.8 * mf.max_score()
    # One shared loop body for both groups.  The original assigned the
    # second group's best score to ``max_score3`` but tested ``max_score``,
    # i.e. the stale value left over from the last sequence of seq1.
    for column, sequences in enumerate((seq1, seq2)):
        for sequence in sequences:
            scores = [score for pos, score in mf.search_pwm(sequence)]
            max_score = max(scores) if scores else -1e5
            row = 0 if max_score > cutoff else 1
            count_all[row, column] += 1
    return count_all
Example #18
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    """Score every FASTA sequence with every PWM and write wiggle files.

    For each (sequence, PWM) pair the sequence is scanned with ``scanPWM``,
    the score vector is padded with ``len(pwm) - 1`` NaN values so it has the
    same length as the sequence, ``getMaxPWMScore`` is applied to it, and
    both vectors are written with ``vegardswritewiggle`` below
    ``destdir/start_index_score/`` and ``destdir/best_score_in_window/``.

    Arguments:
    fastafiles -- iterable of FASTA paths, each holding a single record.
    pwmfiles -- iterable of JASPAR PFM paths.
    destdir -- directory below which the wiggle files are written.
    both_strands -- accepted for interface compatibility; not used here.
    """
    # Iterate the parameter; the original looped over the unrelated name
    # ``fastafile`` and so ignored (or failed on) the argument.
    for fastaf in fastafiles:
        ### sequence only needed for length here. MOODS does this parsing again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        thisseq = Bio.SeqIO.read(fastaf,
                                 "fasta",
                                 alphabet=IUPAC.unambiguous_dna)
        #thisseq = Bio.SeqIO.parse(thisseqname, "fasta", alphabet=IUPAC.unambiguous_dna)
        # Progress messages are single pre-formatted strings so the output
        # is the same text the Python 2 print statements produced.
        print('Doing sequence  %s length= %s' % (thisseqname, len(thisseq)))

        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            with open(pwmf) as pwmhandle:
                thispwm = Motif.read(pwmhandle, "jaspar-pfm")
            print(' Doing scanPWM one strands for pwm  %s , length= %s %s'
                  % (thispwmname, len(thispwm[0]), datetime.now()))
            onestrandsindexvector = thispwm.scanPWM(thisseq.seq)
            # adding missing bp-values on the end to get the same length as seq.
            # Copying a slice keeps the dtype of the score vector.
            x = onestrandsindexvector[0:len(thispwm) - 1].copy()
            x[:] = np.nan
            onestrandsindexvector = np.append(onestrandsindexvector, x)
            onestrandsindexvector = np.array([onestrandsindexvector
                                              ])  # takes long time.
            print('  bp with nan score  is  %s  expected  %s'
                  % (np.isnan(onestrandsindexvector).sum(), len(thispwm) - 1))
            print('  finding best score per bp,  %s' % (datetime.now(),))
            bestscorevector = getMaxPWMScore(onestrandsindexvector,
                                             len(thispwm))

            print('  writing wiggle for score per start index. %s' % (datetime.now(),))
            vegardswritewiggle(onestrandsindexvector[0, ],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'start_index_score/' +
                               thispwmname)

            print('  writing wiggle for bestscore.  %s' % (datetime.now(),))
            vegardswritewiggle(bestscorevector,
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' +
                               'best_score_in_window/' + thispwmname)
Example #19
0
def get_pwm_from_clustalw(clustalw_fname):
    """
    Get PWM from CLUSTALW alignments file.

    Every sequence of the alignment is added as an instance of a DNA motif,
    from which the PWM is computed.

    Return PWM and motif object.

    Raises Exception if ``clustalw_fname`` is not an existing file.
    """
    # Check the input first so a bad path is reported before the Biopython
    # modules are imported.  The call form of ``raise`` is valid on both
    # Python 2 and 3, unlike ``raise Exception, "..."``.
    if not os.path.isfile(clustalw_fname):
        raise Exception("CLUSTALW file %s does not exist" % (clustalw_fname))
    from Bio import Motif
    from Bio.Alphabet import IUPAC
    import Bio.Seq as bio_seq
    import Bio.AlignIO as align_io
    # Load CLUSTALW file
    clustalw_input = align_io.read(clustalw_fname, "clustal")
    motif_obj = Motif.Motif(alphabet=IUPAC.unambiguous_dna)
    # Add sequences from CLUSTALW alignment to motif object
    for clustalw_seq in clustalw_input.get_all_seqs():
        curr_seq = bio_seq.Seq(str(clustalw_seq.seq), IUPAC.unambiguous_dna)
        motif_obj.add_instance(curr_seq)
    # Compute PWM
    pwm = motif_obj.pwm()
    return pwm, motif_obj
Example #20
0
import sys
import random
import os
import numpy as np
from Bio import SeqIO
from Bio import Motif
from scipy.stats import fisher_exact
# Directory holding the source matrices named in logolist.txt.
loc = '/compbio/data/motif/human-mouse/'
# For every matrix name listed in logolist.txt: load the matrix, write its
# transpose as integers to a file of the same name in the current directory,
# read that file back as a JASPAR PFM and render a logo to <name>.png.
with open('logolist.txt') as ifp:
    for line in ifp:
        name = line.rstrip()
        # A blank line would otherwise make np.loadtxt open the directory.
        if not name:
            continue
        mat = np.loadtxt(loc + name)
        pfm = np.transpose(mat)
        np.savetxt(name, pfm, fmt='%d')
        with open(name) as pfm_handle:
            mymotif = Motif.read(pfm_handle, 'jaspar-pfm')
        mymotif.weblogo(name + '.png')
Example #21
0
 def test_pfm_parsing(self):
     """Check that a JASPAR pfm file can be loaded into an empty Motif.
     """
     expected_length = 12
     parsed = Motif.Motif()
     parsed.from_jaspar_pfm(self.PFMin)
     assert parsed.length == expected_length
Example #22
0
 def test_alignace_parsing(self):
     """Check that the AlignAce parser reports all 16 motifs of the fixture.
     """
     record = Motif.AlignAceParser().parse(self.ACin)
     motif_count = len(record.motifs)
     assert motif_count == 16
Example #23
0
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test of Biopython's Motif package with scanPWM

#matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9

from Bio import Motif
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from datetime import datetime

# Let's create an instance of the E2F1 motif (downloaded from the
# jaspar database):
testpwm = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm'
with open(testpwm) as pwmhandle:
    motif = Motif.read(pwmhandle, "jaspar-pfm")

# the format method displays the motif in a variety of formats:
print(motif.format('transfac'))

# Alternative small input (this assignment was immediately overwritten):
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastafile = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'

# ``from Bio import Motif`` does not bind the name ``Bio``, so the original
# ``Bio.SeqIO.parse`` call could not resolve; import SeqIO explicitly.
from Bio import SeqIO
with open(fastafile, "r") as handle:
    records = list(SeqIO.parse(handle, "fasta", alphabet=IUPAC.unambiguous_dna))
thisseq = records[0].seq

# Time stamps before and after the scan show how long scanPWM takes.
print(datetime.now())
hits = motif.scanPWM(thisseq)
print(datetime.now())
Example #24
0
 def test_pfm_parsing(self):
     """Check that Motif.read understands the 'jaspar-pfm' format.
     """
     expected_length = 12
     parsed = Motif.read(self.PFMin, "jaspar-pfm")
     assert parsed.length == expected_length
Example #25
0
    def __init__(self, regions, genome_fasta, jaspar_file=None,
                 jaspar_thresh=9999, annotations=None, motif_positions=None,
                 method='motility'):
        """
        Adds motif tracks to BasePrinter, using motility and a file containing
        a JASPAR-format definition of a motif.

        :param regions:
            An iterable of pybedtools.Interval objects

        :param genome_fasta:
            FASTA file from which sequences for `regions` will be extracted

        :param jaspar_file:
            If provided, a file in JASPAR format.  Motifs in each sequence will
            be identified.  It must define exactly one matrix.

        :param jaspar_thresh:
            Score threshold below which motifs will be ignored.  With
            method="biopython" it is replaced by the patser threshold of
            the motif's score distribution.

        :param annotations:
            Optional mapping of name to anything accepted by
            pybedtools.BedTool; each value is saved as a BedTool.

        :param motif_positions:
            If this is a list of integer indexes, these positions will be
            converted to uppercase.

        :param method:
            "motility" or "biopython"
        """
        super(MotifPrinter, self).__init__(regions=regions,
                                           genome_fasta=genome_fasta)
        import motility
        assert method in ['biopython', 'motility']
        self.method = method
        pwm = list(helpers.pwm_from_jaspar(jaspar_file))
        assert len(pwm) == 1
        self.pwm = motility.PWM(pwm[0][1])

        # The 'jaspar-pfm' reader is given a copy of the file without header
        # lines, brackets and row labels.  The copy is written to a file
        # named 'tmp' in the current working directory.  All handles are now
        # closed deterministically.
        with open('tmp', 'w') as tmp:
            with open(jaspar_file) as jaspar_handle:
                for line in jaspar_handle:
                    if line.startswith('>'):
                        continue
                    for i in '[]ATCG':
                        line = line.replace(i, '')
                    tmp.write(line)
        with open(tmp.name) as stripped:
            self.motif = Motif.read(stripped, 'jaspar-pfm')

        if method == 'biopython':
            sd = Motif.ScoreDistribution(self.motif)
            jaspar_thresh = sd.threshold_patser()

        self.jaspar_thresh = jaspar_thresh

        if motif_positions is None:
            motif_positions = []
        self.motif_positions = motif_positions

        self._annotations = {}
        if annotations:
            for k, v in annotations.items():
                self._annotations[k] = pybedtools.BedTool(v).saveas()

        self.trackfuncs.append(self.motifs)
        self.trackfuncs.append(self.annotations)
        self.intervals = []
Example #26
0
##########testing .......
#calculate_both_strands=True
#outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput'


#fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa']
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm']

#fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'

# Inputs for a manual test run: one chromosome FASTA and one PFM file.
fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'
pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'
# NOTE(review): ``outputdir`` is not assigned in the visible part of this
# script (its assignment above is commented out) -- confirm it is defined
# earlier.
destdir = outputdir



# Read the JASPAR "matrix_only" file with the 'jaspar-pfm' reader.
jaspar_file='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt'
thispwm = Motif.read(open(jaspar_file), "jaspar-pfm")

#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm']

#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test of Biopython's Motif package with scanPWM



# Completion marker; the Norwegian message means "done in pwmtest4".
print "ferdig i pwmtest4"
Example #27
0
# Score only the first sequence against the first PWM, then print the time.
makePWMscorefiles(fastafile[:1], pwmfiles[:1], outputdir)
print(datetime.now())

##########testing .......
#calculate_both_strands=True
#outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput'

#fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa']
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm']

#fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'

# Inputs for a manual test run: one chromosome FASTA and one matrix file.
fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/spombe2007/Sequence/DNA/chr1.fa'
#fastaf = ''
#pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'
pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/matrix/JASPAR_CORE_2008/MA0086.pwf'
destdir = outputdir

# Read the JASPAR "matrix_only" file; the handle is closed once it is parsed.
jaspar_file = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt'
with open(jaspar_file) as jaspar_handle:
    thispwm = Motif.read(jaspar_handle, "jaspar-pfm")

#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm']

#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test of Biopython's Motif package with scanPWM

# Completion marker; the Norwegian message means "done in pwmtest4".
print("ferdig i pwmtest4")
Example #28
0
# Run the algorithm on a tiny FASTA file, asking for MEME-like output with a
# fixed motif width of 8.
options = stempy.get_default_options()
options.output_dir = os.path.join('output', 'test-meme-like-output')
options.min_w = options.max_w = 8
options.meme_like_output = 'meme.out'
algorithm = stempy.Algorithm(options)
fasta = os.path.join(os.path.dirname(__file__), 'fasta', 'T00759-tiny.fa')
algorithm(fasta)
logging.info('Showing MEME output from %s', algorithm.meme_like_output_file)
# Echo the file from Python instead of running ``cat`` through a shell: no
# shell quoting issues with the path and no dependency on a POSIX ``cat``.
import sys
with open(algorithm.meme_like_output_file) as meme_output:
    sys.stdout.write(meme_output.read())


#
# Test BioPython parser
#
from Bio import Motif
# Materialise the motifs while the file is open, then release the handle.
with open(algorithm.meme_like_output_file) as meme_output:
    motifs = list(Motif.parse(meme_output, "MEME"))


#
# Doesn't quite work with pycogent yet. Pycogent expects a summary section
# that contains sites in all the sequences 
#
#from cogent import LoadSeqs
#from cogent.parse.meme import MemeParser
#results = MemeParser(open(algorithm.meme_like_output_file, 'U'))
#seqs = LoadSeqs(fasta, aligned=False)
#results.Alignment = seqs
#for motif in results.Motifs:
#    module = motif.Modules[0]
#    print module.ID, module.Evalue, len(module.NamedSeqs)
#
Example #29
0
            ostr += ' ' + str(count[l])
        ostr += '\n'
    count_dict[name] = ostr

# <codecell>

print(count_dict['COUP2'])

# <codecell>

from io import StringIO

# Parse every count table collected in count_dict as a 'jaspar-pfm' motif.
# NOTE(review): on Python 2, io.StringIO accepts unicode text only; the
# print of type(tmp) below suggests this was being checked -- confirm the
# count_dict values are unicode.
motif_dict = {}
for key, tmp in count_dict.items():
    print key, type(tmp)
    motif_dict[key] = Motif.read(StringIO(tmp), 'jaspar-pfm')

# Hand-entered count matrix stored under 'AP1' (rows A, C, G, T).
tmp = u"""A  0  0 16  5  3  0 16
C  1  0  2 12  0 15  0
G  0 15  0  1  1  3  1
T  17  3  0  0 14  0  1"""

motif_dict['AP1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

# Hand-entered count matrix stored under 'CEBP' (rows A, C, G, T).
tmp = u"""A  3  1  4  2  4  2 18 18  0
C  0  1  1  9  2 15  0  0  6
G  0  4  6  2 10  0  0  0  2
T  15 12  7  5  2  1  0  0 10"""

motif_dict['CEBP'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
Example #30
0
 def test_sites_parsing(self):
     """Check that Motif.read understands the 'jaspar-sites' format.
     """
     expected_length = 6
     parsed = Motif.read(self.SITESin, "jaspar-sites")
     assert parsed.length == expected_length
Example #31
0
 def test_pfm_parsing(self):
     """Check that Motif.read understands the 'jaspar-pfm' format.
     """
     expected_length = 12
     parsed = Motif.read(self.PFMin, "jaspar-pfm")
     assert parsed.length == expected_length
Example #32
0
    def __init__(self,
                 type,
                 score_type,
                 upstr=None,
                 invariant=None,
                 score_dict=None,
                 bounds=None,
                 filter_score=None,
                 note_str_func=default_note_str,
                 # mutant_check= _mutant_check
                 ** attribs
                 ):
        """Set up a motif type and register it in ``motif_types``.

        Arguments:
        type -- name of the motif type; also the key used in ``motif_types``.
        score_type -- one of 'acora', 'tst', 'max_ent' or 'pfm'; selects the
                      scoring method bound to ``self.score``.
        upstr -- stored unchanged on the instance.
        invariant -- stored on the instance; a new empty list when omitted.
        score_dict -- for 'acora' and 'tst' the path of a file with one
                      whitespace-separated record per line; for 'pfm' a glob
                      pattern matching ``.pfm`` files; unused for 'max_ent'.
        bounds -- stored unchanged on the instance.
        filter_score -- a number (keep scores greater than it), a
                        ``(min, max)`` tuple (keep scores outside that
                        range), or None (keep everything); converted into a
                        predicate stored as ``self.filter_score``.
        note_str_func -- stored as ``self.note_str``.
        attribs -- any further keyword arguments, stored as ``self.attribs``.

        Raises NeedScoreDictFileException when ``score_type`` is 'acora' and
        ``score_dict`` is not a string.
        """
        self.type = type
        self.upstr = upstr
        # A fresh list per instance; a mutable default in the signature
        # would be shared by every instance created without this argument.
        self.invariant = [] if invariant is None else invariant
        self.filter_score = filter_score
        self.note_str = note_str_func
        self.bounds = bounds
        self.score_type = score_type
        # self.mutant_check = mutant_check

        #------------------------------------------------
        # FIRST: figure out what score type this motif is.
        #------------------------------------------------

        # aho-corasick search tree
        if self.score_type == 'acora':
            # we need a score dict from a file and a separate acora object
            if not isinstance(score_dict, str):
                raise NeedScoreDictFileException
            with open(score_dict) as records:
                self.score_dict = dict([record.split() for record in records])

            self.acora_tree = acora.AcoraBuilder(self.score_dict.keys()).build()
            self.score = self.acora

        # ternary search tree
        elif score_type == 'tst':
            # make our file of nmers and scores a tst object
            self.score_dict = tst.TST()
            # Explicit loop: map() used only for its side effects is lazy on
            # Python 3 and would never insert anything there.
            with open(score_dict) as records:
                for record in records:
                    self.score_dict.put(*record.split())

            self.score = self.tst

        # max ent nmer score
        elif score_type == 'max_ent':
            self.score_dict = {}
            self.score = self.max_ent

            # open up an 'interactive' pipe to the maxent software for this motif
            programs = {'me_splice_donor':'score5', 'me_splice_acceptor':'score3'}

            self.command = cfg.programTemplate.substitute(path=cfg.maxEntPath,
                                        program=programs[self.type])

        # position frequency matrix list
        elif score_type == 'pfm':
            print("Loading position frequency matrices from {}...".format(self.type))
            pfm_glob = glob.glob(score_dict)
            # Raw string: same pattern text, without the invalid '\w' escape
            # of a normal string literal.
            name_pattern = re.compile(r'.*/(\w+).pfm')

            self.score_dict = {}

            for motif_file in pfm_glob:
                with open(motif_file) as motif_handle:
                    motif_obj = Motif.read(motif_handle, 'jaspar-pfm')
                motif_name = name_pattern.match(motif_file).group(1)
                print("\t{}...".format(motif_name))
                motif_obj.name = motif_name
                # Only motifs longer than 7 positions are kept.
                if len(motif_obj) > 7:
                    self.score_dict[motif_name] = motif_obj
                    motif_obj.sd = \
                        ScoreDistribution(motif_obj, precision=10 ** 3)

                    # low false-positive rate to make sure motifs are real
                    motif_obj.thresh = max(1, motif_obj.sd.threshold_fpr(0.01))

            self.score = self.pfm
            print("Motif matrices done.")

        #------------------------------------------------
        # SECOND: parse filter score information.
        #------------------------------------------------

        # if filter score is an int, make it a lambda function that determines
        # whether or not it is a 'worthwhile' score; this could be as simple as
        # > 0, or it could be a range, etc, etc. The lambda function will
        # return true if the score should be kept and false if it should not.

        if isinstance(self.filter_score, (float, int)):

            self.filter_score = \
                lambda val, min = self.filter_score: \
                    val > min

        elif isinstance(self.filter_score, tuple):

            self.filter_score = \
                lambda val, minmax = self.filter_score: \
                    val < minmax[0] or val > minmax[1]

        # if filter_score is none, always return true
        elif self.filter_score is None:
            self.filter_score = lambda val: True

        #------------------------------------------------
        # THIRD: add motif to motif type dict and cleanup
        #------------------------------------------------
        motif_types[self.type] = self
        self.attribs = attribs
Example #33
0
#!/usr/bin/env python
# counts all dinucleotides in a DNA fasta file
import sys
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Motif
from Bio.Alphabet import IUPAC
fastafile = sys.argv[1]


def _single_site_motif(site):
    """Return a DNA motif whose only instance is ``site``."""
    motif = Motif.Motif(alphabet=IUPAC.unambiguous_dna)
    motif.add_instance(Seq(site, motif.alphabet))
    return motif


# One single-instance motif per dinucleotide, created in the original order.
AA = _single_site_motif("AA")
CA = _single_site_motif("CA")
GA = _single_site_motif("GA")
TA = _single_site_motif("TA")
AC = _single_site_motif("AC")
CC = _single_site_motif("CC")
GC = _single_site_motif("GC")
TC = _single_site_motif("TC")
AG = _single_site_motif("AG")
CG = _single_site_motif("CG")
GG = _single_site_motif("GG")
Example #34
0
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'

#### test av biopythons motif pakke med scanPWM

#matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9

from Bio import Motif
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from datetime import datetime

# Let's create an instance of the E2F1 motif (downloaded from the
# jaspar database):
testpwm = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm'
with open(testpwm) as pwm_handle:
    motif = Motif.read(pwm_handle, "jaspar-pfm")

# the format method displays the motif in a variety of formats:
print(motif.format('transfac'))

# Two candidate inputs; the second assignment wins. Comment it out to scan
# the small random-DNA file instead of chr1.
fastafile = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastafile = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'

# SeqIO has to be imported explicitly: "from Bio import Motif" does not bind
# the name "Bio", so the former "Bio.SeqIO.parse" raised NameError.
with open(fastafile, "r") as handle:
    records = list(SeqIO.parse(handle, "fasta",
                               alphabet=IUPAC.unambiguous_dna))
thisseq = records[0].seq

# Print a timestamp before and after the scan to time scanPWM.
print(datetime.now())
hits = motif.scanPWM(thisseq)
print(datetime.now())
Example #35
0
def bitScoreMM(pwmFileName,
               genomeDict,
               mpbsDict,
               scoringMethod,
               tempLocation,
               pseudocounts=0.1,
               bitscore=12.0,
               fpr=0.01,
               precision=10**4,
               highCutoff=0.7,
               functionalDepth=0.9):
    """Performs basic motif matching algorithm and writes the results to a dictionary indexed by chromosome.

    Every window of the PWM's length is scored on both strands. Windows whose
    score is strictly above the cutoff are kept, and their scores are rescaled
    to [0,1000] relative to the cutoff and to the best score found.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary.
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched ("bitscore", "fpr" or "boyle").
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- Always. The matches are appended to the lists of mpbsDict, each one
         as [start, end, pwmName, scaledScore, strand].

    Terminates the process with exit status 1 if scoringMethod is not one of
    the supported methods.
    """

    # Reading PWM
    pwm = createPwmDict(pwmFileName, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm["A"])
    background = math.log(0.25, 2) * pwmLen

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation,
                                         pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        maxScore = 0.0
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        for i in range(0, pwmLen):
            maxScore += max(pwm["A"][i], pwm["C"][i], pwm["G"][i], pwm["T"][i])
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # An unusable configuration is a failure, so it must not be reported
        # to the calling process as success (status 0).
        sys.exit(1)

    # Creating aditional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict((e, []) for e in chrList)
    # Start below every possible score so the maximum is always a score that
    # was actually observed; a fixed sentinel such as -99.0 would win over
    # lower real scores and distort the rescaling.
    maxValue = float("-inf")
    revDict = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName].upper()
        chrHits = tempMpbsDict[chrName]

        # Performing motif matching
        for pos in xrange(0, len(sequence) - pwmLen + 1):
            scoreF = -background
            scoreR = -background
            for i in range(0, pwmLen):
                scoreF += pwm[sequence[pos + i]][i]
                # Reverse strand: read the window backwards and complement it.
                scoreR += pwm[revDict[sequence[pos + pwmLen - i - 1]]][i]
            if scoreF > pwmThreshold:
                if scoreF > maxValue: maxValue = scoreF
                chrHits.append([pos, pos + pwmLen, pwmName, scoreF, "+"])
            if scoreR > pwmThreshold:
                if scoreR > maxValue: maxValue = scoreR
                chrHits.append([pos, pos + pwmLen, pwmName, scoreR, "-"])

    # Update scores - new scores are within [0,1000]
    # Every kept score is strictly above pwmThreshold, so whenever there is
    # at least one hit the range below is positive.
    scoreRange = maxValue - pwmThreshold
    for chrName in chrList:
        for e in tempMpbsDict[chrName]:
            mpbsDict[chrName].append([
                e[0], e[1], e[2],
                int(1000 * (e[3] - pwmThreshold) / scoreRange),
                e[4]
            ])

    return 0
Example #36
0
def biopythonMM(pwmFileName,
                genomeDict,
                mpbsDict,
                scoringMethod,
                tempLocation,
                pseudocounts=0.1,
                bitscore=12.0,
                fpr=0.01,
                precision=10**4,
                highCutoff=0.7,
                functionalDepth=0.9):
    """Performs Biopython based motif matching and writes the results to a dictionary indexed by chromosome.

    The hits reported by the PWM's search_pwm() are collected for every
    chromosome, then their scores are rescaled to [0,1000] relative to the
    cutoff and to the best score found.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary.
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched ("bitscore", "fpr" or "boyle").
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- Always. The matches are appended to the lists of mpbsDict, each one
         as [start, end, pwmName, scaledScore, strand].

    Terminates the process with exit status 1 if scoringMethod is not one of
    the supported methods.
    """

    # Reading PWM
    pwm = readPwmFile(pwmFileName, tempLocation, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm)

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        sd = Motif.ScoreDistribution(pwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        maxScore = pwm.max_score()
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # An unusable configuration is a failure, so it must not be reported
        # to the calling process as success (status 0).
        sys.exit(1)

    # Creating aditional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict((e, []) for e in chrList)
    # Start below every possible score so the maximum is always a score that
    # was actually observed; a fixed sentinel such as -99.0 would win over
    # lower real scores and distort the rescaling.
    maxValue = float("-inf")

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName]
        chrHits = tempMpbsDict[chrName]

        # Performing biopython's motif matching
        for pos, score in pwm.search_pwm(sequence, threshold=pwmThreshold):
            if score > maxValue: maxValue = score
            if pos >= 0:
                chrHits.append([pos, pos + pwmLen, pwmName, score, "+"])
            else:
                # A negative position is recorded as a "-" strand hit at the
                # negated (positive) coordinate.
                chrHits.append([-pos, -pos + pwmLen, pwmName, score, "-"])

    # Update scores - new scores are within [0,1000]
    # NOTE(review): this assumes search_pwm only yields scores strictly above
    # the threshold, so the range is positive whenever there is a hit - confirm.
    scoreRange = maxValue - pwmThreshold
    for chrName in chrList:
        for e in tempMpbsDict[chrName]:
            mpbsDict[chrName].append([
                e[0], e[1], e[2],
                int(1000 * (e[3] - pwmThreshold) / scoreRange),
                e[4]
            ])

    return 0
Example #37
0
def reverse(sequence):
    """Return sequence read backwards with every symbol mapped through revDict."""
    return "".join([revDict[symbol] for symbol in sequence[::-1]])


# Fetching sequences
# bedName is the BED file's base name minus its last four characters
# (presumably the ".bed" extension - confirm for other extensions).
bedName = bedFileName.split("/")[-1][:-4]
# Runs the external "fastaFromBed" tool through the shell; its tab-delimited
# output is written to <outputLocation><bedName>.txt.
# NOTE(review): the command line is built by plain concatenation, so paths
# containing spaces or shell metacharacters will break it.
os.system("fastaFromBed -fi " + fastaFileName + " -fo " + outputLocation +
          bedName + ".txt" + " -bed " + bedFileName + " -tab")

# Reading pwm
pwmFile = open(pwmFileName, "r")
pwm = Motif.read(pwmFile, "jaspar-pfm")
# Upper-cased consensus sequence of the matrix.
motif = str(pwm.consensus()).upper()
pwmFile.close()

# Reading input vectors
# Accumulators filled while walking the BED file and the extracted sequences
# line by line (both files are opened here and consumed in parallel below).
misVec = []
misVecSpec = []
posVec = []
posVecSpec = []
scoreVec = []
bedFile = open(bedFileName, "r")
seqFile = open(outputLocation + bedName + ".txt", "r")
for bedLine in bedFile:

    # Reading line
    seqLine = seqFile.readline()
                mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
                yield name, mot
                yield name+'-R', mot.reverse_complement()
# Collect every (name, motif) pair produced by yield_motifs() into a
# name -> motif dictionary; a later pair with the same name replaces the
# earlier one.
pwm_dict = {}
for num, (name, mot) in enumerate(yield_motifs()):
    # Progress indicator: print the running index every 100 motifs.
    if num % 100 == 0:
        print num
    pwm_dict[name] = mot
    
    
tmp = u"""A 0  0 6 1 0 0 0 4 2 2 0 0 3 
C  1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0  
G  0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 
T  6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""

# Register the hand-written matrix in tmp under 'coup2' and its reverse
# complement under 'coup2-R' (tmp is parsed twice, once per entry).
# NOTE(review): in tmp the "A" row lists 13 counts while the "C", "G" and
# "T" rows list 16 each - confirm the matrix was transcribed correctly.
pwm_dict['coup2'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
pwm_dict['coup2-R'] = Motif.read(StringIO(tmp), 'jaspar-pfm').reverse_complement()

# <codecell>

from Bio.Alphabet import IUPAC

def score_seq(seq, mot):
    """Yield (position, subsequence, score) for every scorable window of seq.

    seq -- DNA sequence as a plain string.
    mot -- Motif providing scanPWM() and len().

    The position is 1-based, the subsequence is the len(mot)-long window of
    seq that starts there, and windows whose score is NaN are skipped.
    """
    bseq = Seq(seq, alphabet=IUPAC.unambiguous_dna)
    scores = mot.scanPWM(bseq)
    motif_len = len(mot)
    # Assumes scanPWM returns one score per window start, i.e. index i scores
    # seq[i:i + len(mot)] - TODO confirm against the Biopython version in use.
    # The reported position stays 1-based, but the slice has to use the
    # 0-based index: slicing from the 1-based position returned the window
    # shifted by one base and truncated the last one.
    for start, score in enumerate(scores.flatten()):
        if not np.isnan(score):
            yield start + 1, seq[start:start + motif_len], score
    
Example #39
0
 def test_sites_parsing(self):
     """Check that Motif.read parses the JASPAR sites fixture into a motif of length 6."""
     parsed = Motif.read(self.SITESin,"jaspar-sites")
     assert parsed.length==6
Example #40
0
def motifMatchingBiopython(combinationList,pwmList,coordDict,pwmLocation,genomeList,tempLocation,fpr=0.01,pseudocounts=0.0,precision=10**4,color="black"):
    """Performs Biopython based motif matching and returns a list containing the matches and
       writes the results on bed files.

    Every region of coordDict is scanned with every PWM. Hits are stored per
    PWM and chromosome, and for every combination of PWMs the function counts
    in how many regions all of its PWMs were found.

    Keyword arguments:
    combinationList -- List of the number of co-binding combinations (sizes of the PWM combinations to evaluate).
    pwmList -- List of PWMs where each entry represents the name of a PWM file.
    coordDict -- Dictionary of coordinates where the motif matching will be applied.
                 Each coordinate is indexed as [start, end, names], names being a ":" separated string.
    pwmLocation -- Path containing the motif pwm files.
    genomeList -- List of fasta files containing the sequences to perform the motif matching, where the headers are the chromosomes.
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.0)
    precision -- Motif score distribution precision. (default 10**4)
    color -- Color of the bed entries. Can be 'green', 'red' or 'black'. (default 'black')
             Any other value is stored in the entries unchanged.

    Returns:
    mpbsDict -- Dictionary (for each PWM) of dictionaries (for each chromosome) of motif predicted binding sites.
                Each site is [start, end, pwmName, score, strand, start, end, color], score rescaled to [0,1000].
    statDict -- Dictionary of statistics for Fisher test concerning the number of motifs inside enriched regions.
                Each value is [regions where the whole combination was found, regions where it was not].
    geneDict -- Dictionary of genes (position NAME in bed file) that contains each motif.
    """
    
    # Reading PWM
    # The temporary location handed to readPwmFile is tempLocation without
    # its last "/"-separated component.
    pwmDict = dict()
    for pwmName in pwmList: pwmDict[pwmName] = readPwmFile(pwmLocation+pwmName+".pwm","/".join(tempLocation.split("/")[:-1])+"/",pseudocounts)

    # Evaluating thresholds
    # One score cutoff per PWM, derived from its score distribution at the
    # requested false positive rate.
    pwmThresholdDict = dict()
    for pwmName in pwmList:
        sd = Motif.ScoreDistribution(pwmDict[pwmName],precision=precision)
        pwmThresholdDict[pwmName] = sd.threshold_fpr(fpr)

    # Reading genome
    genomeDict = genome.readFastaFiles(genomeList)

    # Creating chromosome list
    chrList = constants.getChromList(reference=[coordDict])
    # Removing chrX, chrY and chrM
    # TODO Stop removing these chromosomes
    #chrListT = []
    #for e in chrList:
    #    if(e not in ["chrX", "chrY", "chrM"]): chrListT.append(e)
    #chrList = chrListT

    # Evaluating bed additionals
    # Translate the color name into the "R,G,B" string stored in each entry.
    if(color == "green"): color = "0,130,0"
    elif(color == "red"): color = "130,0,0"
    elif(color == "black"): color = "0,0,0"

    # Create combinations dictionary keys
    # A key is the comma-joined names of one combination of PWMs, for every
    # combination size listed in combinationList.
    combKeys = []
    for c in combinationList:
        for b in [",".join(e) for e in itertools.combinations(pwmList,c)]: combKeys.append(b)

    # Iterating on chromosomes
    mpbsDict = dict([(e,dict()) for e in pwmDict.keys()])
    statDict = dict([(e,[0,0]) for e in combKeys]) # Left is evidence / Right is not evidence
    geneDict = dict([(e,[]) for e in combKeys])
    # Best raw score seen per PWM, used for the final rescaling.
    maxDict = dict([(e,-99.0) for e in pwmDict.keys()])
    # Running count of processed regions (only used by the disabled debug print below).
    ct=0
    for chrName in chrList:

        # Reading genome
        # Chromosomes without a sequence are skipped entirely, so they get
        # no entry in mpbsDict.
        if(chrName not in genomeDict.keys()): continue
        sequence = genomeDict[chrName]

        # Iterating on coordinate dictionary
        for e in mpbsDict.keys(): mpbsDict[e][chrName] = []
        for coord in coordDict[chrName]:
            ct=ct+1
            #print "region", ct
            # Getting current sequence based on coordinates
            currSeq = sequence[coord[0]:coord[1]]

            # Keeping track of the factors found in this coordinate
            flagMotifs = dict([(e,False) for e in pwmDict.keys()])

            # Iterating on PWMs
            for pwmName in pwmDict.keys():
                pwmLen = len(pwmDict[pwmName])
                for pos, score in pwmDict[pwmName].search_pwm(currSeq,threshold=pwmThresholdDict[pwmName]):
                    if(score > maxDict[pwmName]): maxDict[pwmName] = score
                    # Positions are relative to the region, so coord[0] is
                    # added to shift them to chromosome coordinates. A
                    # negative position is stored as a "-" strand hit at the
                    # negated offset.
                    if(pos >= 0): mpbsDict[pwmName][chrName].append([pos+coord[0],pos+coord[0]+pwmLen,pwmName,score,"+",pos+coord[0],pos+coord[0]+pwmLen,color])
                    else: mpbsDict[pwmName][chrName].append([-pos+coord[0],-pos+coord[0]+pwmLen,pwmName,score,"-",-pos+coord[0],-pos+coord[0]+pwmLen,color])
                    flagMotifs[pwmName] = True
            
            # Updating statistic counts and genes
            # motifsFoundList keeps the order of pwmList, so the joined
            # combinations built from it match the keys in combKeys.
            motifsFoundList = [k for k in pwmList if flagMotifs[k]]
            motifsFoundKeys = []
            motifsNotFoundKeys = [e for e in combKeys]
            for c in combinationList:
                for b in [",".join(e) for e in itertools.combinations(motifsFoundList,c)]:
                    motifsFoundKeys.append(b)
                    motifsNotFoundKeys.remove(b)
            for k in motifsFoundKeys:
                statDict[k][0] += 1
                for e in coord[2].split(":"): geneDict[k].append(e)
            for k in motifsNotFoundKeys:
                statDict[k][1] += 1

    # Update scores - new scores are within [0,1000]
    # Rescaled in place relative to the PWM's cutoff and its best score.
    for pwmName in pwmDict.keys():
        for chrName in mpbsDict[pwmName].keys():
            for e in mpbsDict[pwmName][chrName]:
                e[3] = int(1000*(e[3]-pwmThresholdDict[pwmName])/(maxDict[pwmName]-pwmThresholdDict[pwmName]))

    # Remove repetitive genes from geneList
    # Going through a set drops duplicates but does not preserve the order.
    for k in geneDict.keys(): geneDict[k] = list(set(geneDict[k]))
    
    return mpbsDict, statDict, geneDict
Example #41
0
# 1. <inputFileName>.png: The logo graphic.
#####################################################################################################################

import sys
import os
import math
from Bio import Motif

# Reading input
# argv[1]: nucleotides per image (<= 0 draws the whole motif in one image),
# argv[2]: input matrix file, argv[3]: output location prefix.
nucsPerImage = int(sys.argv[1])
inputFileLocation = sys.argv[2]
outputLocation = sys.argv[3]

# Reading pfm file
# NOTE(review): inputFile is not closed in the visible part of the script.
inputFile = open(inputFileLocation, "r")
pwm = Motif.read(inputFile, "jaspar-pfm")

# Writing whole or split logo
if (nucsPerImage <= 0):
    # Single image named after the input file's base name (text before the
    # first "."), written under outputLocation.
    pwm.weblogo(outputLocation +
                (inputFileLocation.split("/")[-1].split(".")[0]) + ".png",
                res=300)
else:
    # tempPWM holds one list of counts per nucleotide, in the order of nucs.
    tempPWM = [[], [], [], []]
    nucs = ["A", "C", "G", "T"]
    counter = 0
    fileCount = 0
    for i in range(0, len(pwm)):
        # Copy column i of the count matrix into the temporary matrix.
        for j in range(0, len(nucs)):
            tempPWM[j].append(pwm.counts[nucs[j]][i])
        counter += 1
	def procurar(self,sequencia_string):
		'''
		Scan a sequence with every distinctly named motif found in self.diretorio.

		The directory must hold the "*.pfm" matrices (JASPAR pfm format) and
		a tab-separated "matrix_list.txt" whose first column is the matrix id
		(the "<digits>_<digits>" part of the file name); the motif name is
		taken from the text after the first "_" of the third column. Only the
		first matrix found for each name is loaded. Files whose name does not
		follow the "<digits>_<digits>.pfm" pattern are ignored.

		sequencia_string -- sequence to scan (upper-cased before scanning).

		Returns a tuple:
			motivos_finais -- list of (pos, score, motif name, region)
			tamanho_seq    -- length of sequencia_string
			lista_nomes    -- names of the loaded motifs
			saida_hmm      -- list of (region, motif name)
			classificar    -- list of sorted [region, motif name] pairs
		region is "1" for hits within the first self.pontas percent of the
		sequence, "3" for hits within the last self.pontas percent, and "2"
		for everything in between.
		'''
		files = glob.glob( self.diretorio + "*.pfm")
		with open( self.diretorio +"matrix_list.txt", 'r') as arquivo_nomes:
			nomes_vetor = arquivo_nomes.read().split('\n')
		# Split every listing line once instead of once per .pfm file.
		registros = [linha.split('\t') for linha in nomes_vetor]

		lista_motivos = []
		lista_nomes = []
		motivos_checar_repetidos = {}
		for motivo in files:
			isolar_nome = re.search(r'(\d*_\d*)\.pfm$', motivo)
			if isolar_nome is None:
				# Not named "<digits>_<digits>.pfm": it cannot be looked up
				# in the listing, so skip it instead of crashing.
				continue
			isolado = isolar_nome.group(1)

			for campos in registros:
				if campos[0] != isolado:
					continue
				nome_recolocar = campos[2].split('_')[1]
				if nome_recolocar in motivos_checar_repetidos:
					# Name already loaded from another matrix: only count it.
					motivos_checar_repetidos[nome_recolocar] += 1
				else:
					motivos_checar_repetidos[nome_recolocar] = 1
					with open(motivo) as arquivo_motivo:
						lista_motivos.append(Motif.read(arquivo_motivo, "jaspar-pfm"))
					lista_nomes.append(nome_recolocar)
		sequencia = Seq(sequencia_string.upper())

		# Build instances from the counts of every loaded motif before searching.
		for converter in lista_motivos:
			converter.make_instances_from_counts()

		tamanho_seq=len(sequencia_string)
		# Size of each end region: self.pontas percent of the sequence length.
		tss=((tamanho_seq*self.pontas)/100)
		three_end= tamanho_seq-tss

		motivos_finais=[]
		saida_hmm=[]
		classificar=[]
		# lista_nomes and lista_motivos are filled together, so they pair up.
		for nome, procura in zip(lista_nomes, lista_motivos):
			for pos, score in procura.search_pwm(sequencia,threshold=10):
				# NOTE(review): search_pwm is understood to report hits on
				# the reverse strand with a negated position - confirm. The
				# region is therefore decided on the absolute offset;
				# comparing the raw negative value put every such hit in
				# region "1".
				deslocamento = abs(pos)
				if deslocamento >= three_end:
					posicao = "3"
				elif deslocamento <= tss:
					posicao = "1"
				else:
					posicao = "2"
				motivos_finais.append((pos, score, nome, posicao))
				saida_hmm.append((posicao, nome))
				classificar.append(sorted((posicao, nome)))

		return (motivos_finais,tamanho_seq,lista_nomes,saida_hmm,classificar)
Example #43
0
 def test_meme_parsing(self):
     """Check that the MEME parser finds exactly one motif in the MEME fixture."""
     meme_record = Motif.MEMEParser().parse(self.MEMEin)
     assert len(meme_record.motifs) == 1
Example #44
0
def make_seq(seq, comp = False):
    """Wrap seq in an unambiguous-DNA Seq; return its reverse complement when comp is true."""
    dna = Seq(seq,Alphabet.IUPAC.unambiguous_dna)
    return dna.reverse_complement() if comp else dna

def score_seq(mot, seq, comp = False):
    """Return the first score scanPWM reports for seq (reverse-complemented when comp is true)."""
    window_scores = mot.scanPWM(make_seq(seq, comp = comp))
    return window_scores[0]


tmp = u"""A 1 2 0 0 0 2 0 0 1 2 
C 1 1 0 0 5 0 1 0 1 0
G 4 4 8 8 2 4 5 6 6 0
T 2 1 0 0 1 2 2 2 0 6"""

sp1_mot = Motif.read(StringIO(tmp), 'jaspar-pfm')

# <codecell>

test_seqs = [('sp3', 'GAGGCGTGGC'),
             ('sp2', 'TGGGCGGGAC'),
             ('sp1', 'GGGGAGTGGC')]
res = []
for name, base_seq in test_seqs:
    bs = list(base_seq)
    
    mat = np.zeros((6, len(bs)+2))
    for n in range(len(bs)):
        olet = bs[n]
        for ln, let in enumerate('ACTG'):
            bs[n] = let
Example #45
0
 def test_sites_parsing(self):
     """Check that from_jaspar_sites loads the sites fixture into a motif of length 6."""
     sites_motif = Motif.Motif()
     sites_motif.from_jaspar_sites(self.SITESin)
     assert sites_motif.length == 6
Example #46
0
def fimoMM(pwmFileName,
           genomeFile,
           mpbsDict,
           scoringMethod,
           tempLocation,
           pseudocounts=0.1,
           bitscore=12.0,
           fpr=0.01,
           precision=10**4,
           highCutoff=0.7,
           functionalDepth=0.9,
           threshold=0.0001):
    """Performs FIMO motif matching algorithm and writes the results to a dictionary indexed by chromosome.

    The PWM is converted to MEME format, the external "fimo" program is run
    on genomeFile, and its text output is parsed. Except for the "fimo"
    scoring method, hits scoring below the computed cutoff are dropped.
    The scores of the kept hits are rescaled relative to the cutoff and to
    the best score found.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeFile -- Fasta file containing the regions to be analyzed
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched ("bitscore", "fpr", "boyle" or "fimo").
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)
    threshold -- The cutoff threshold value passed to FIMO. Only honoured by
                 the "fimo" scoring method; the others use 0.1. (default 0.0001)

    Returns:
    0 -- Always. The matches are appended to the lists of mpbsDict (created
         when missing), each one as [start, end, motifName, scaledScore, strand].

    Terminates the process with exit status 1 if scoringMethod is not one of
    the supported methods.
    """

    # Converting jaspar to MEME
    memeFileName = jasparToMeme(pwmFileName, tempLocation, pseudocounts)
    tempPath = "/".join(memeFileName.split("/")[:-1]) + "/"
    fimoFileName = tempPath + "results.txt"
    errorOutputName = tempPath + "error.txt"

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
        threshold = 0.1
    elif scoringMethod == "fpr":
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation,
                                         pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
        threshold = 0.1
        print(bioPwm.max_score())
    elif scoringMethod == "boyle":
        maxScore = 0.0
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        pwmBoyle = bitScoreMM.createPwmDict(pwmFileName, pseudocounts)
        pwmLen = len(pwmBoyle["A"])
        for i in range(0, pwmLen):
            maxScore += max(pwmBoyle["A"][i], pwmBoyle["C"][i],
                            pwmBoyle["G"][i], pwmBoyle["T"][i])
        background = math.log(0.25, 2) * pwmLen
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
        threshold = 0.1
    elif scoringMethod == "fimo":
        pass
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # An unusable configuration is a failure, so it must not be reported
        # to the calling process as success (status 0).
        sys.exit(1)

    # Performing FIMO
    os.system(
        "fimo --text --verbosity 1 --max-stored-scores 1000000 --output-pthresh "
        + str(threshold) + " " + memeFileName + " " + genomeFile + " > " +
        fimoFileName + " 2> " + errorOutputName)

    # Reading FIMO output
    tempMpbsDict = dict()
    # Start below every possible score so the maximum is always a score that
    # was actually observed.
    maxValue = float("-inf")
    with open(fimoFileName, "r") as fimoFile:
        fimoFile.readline()  # The first line is skipped as a header.
        for line in fimoFile:
            fields = line.strip().split("\t")
            # The first column packs the strand (first character) together
            # with the motif name (remainder).
            strand, motifName = fields[0][0], fields[0][1:]
            seqName = fields[1]
            score = float(fields[4])
            if scoringMethod != "fimo" and score < pwmThreshold: continue
            if score > maxValue: maxValue = score
            if strand == "+":
                start, end = int(fields[2]) - 1, int(fields[3])
            else:
                # For the other strand the two coordinate columns are read
                # in the opposite order.
                start, end = int(fields[3]) - 1, int(fields[2])
            tempMpbsDict.setdefault(seqName, []).append(
                [start, end, motifName, score, strand])

    # Update scores and remove MPBSs with score below pwmThreshold (if it is being used)
    # Hits scoring exactly pwmThreshold are kept, so the best score may equal
    # the cutoff. The range is then zero and the rescaled score is set to 0
    # instead of raising ZeroDivisionError.
    scoreRange = maxValue - pwmThreshold
    for chrName in tempMpbsDict.keys():
        for e in tempMpbsDict[chrName]:
            if scoreRange:
                scaledScore = int(1000 * (e[3] - pwmThreshold) / scoreRange)
            else:
                scaledScore = 0
            mpbsDict.setdefault(chrName, []).append(
                [e[0], e[1], e[2], scaledScore, e[4]])

    # Removing temporary PWM folder
    os.system("rm -rf " + "/".join(memeFileName.split("/")[:-1]))

    return 0
Example #47
0
#!/usr/bin/env python
# counts a motif (arg1) with overlaps in a fasta file (arg2)
import sys
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Motif
from Bio.Alphabet import IUPAC
# Command line: the motif sequence to look for, then the fasta file to scan.
theMotif = sys.argv[1]
fastafile = sys.argv[2]

# Motif holding a single instance: the sequence given on the command line.
momo=Motif.Motif(alphabet=IUPAC.unambiguous_dna)
momo.add_instance(Seq(theMotif,momo.alphabet))

# Running total of occurrences over all records of the fasta file.
momoc=0

# Opened here, parsed and closed further below.
handle = open(fastafile)

def countMotif(myseqrecord, mymotif):
	"""Return how many hits mymotif.search_instances() yields on the record's sequence."""
	return sum(1 for _ in mymotif.search_instances(myseqrecord.seq))
			
# Add up the hits of every record in the fasta file, then release the file.
for seq_record in SeqIO.parse(handle, "fasta"):
	momoc=momoc + countMotif(seq_record,momo)
handle.close()

# Report the total number of occurrences.
print "motif",theMotif, "found", momoc, "times in the", fastafile, "file"


Example #48
0
from Bio import Motif
from scipy.stats import fisher_exact

def search_motif(mf,seq):
	"""Return the position of the best-scoring hit of motif mf in seq.

	The best hit is only reported when its score is strictly above 80% of
	the motif's maximum score; otherwise, or when there is no hit at all,
	None is returned.
	"""
	cutoff = 0.8*mf.max_score()
	hits = list(mf.search_pwm(seq))
	if not hits:
		return None
	# max() keeps the first of several equally scored hits.
	best_pos, best_score = max(hits, key=lambda hit: hit[1])
	if best_score > cutoff:
		return best_pos
	return None

# Usage: <script> <motif.pfm> <sequences.fasta>
# Writes every sequence with a motif hit to <sequences.fasta>.motif.fa, with
# the hit position appended to its id, then prints a summary.
count = 0
total = 0
with open(sys.argv[1]) as motif_handle:
	mf = Motif.read(motif_handle,'jaspar-pfm')
# "with" guarantees the output is flushed and closed, even on errors.
with open(sys.argv[2]+'.motif.fa','w') as ofp:
	for total,record in enumerate(SeqIO.parse(sys.argv[2],'fasta'), 1):
		hit = search_motif(mf,record.seq)
		if hit is None:
			continue
		record.id = record.id+'_'+str(hit)
		ofp.write('>'+record.id+'\n')
		ofp.write(str(record.seq)+'\n')
		count += 1
# total stays 0 for an empty fasta file; the loop index used previously was
# undefined in that case (NameError) and the ratio would divide by zero.
if total:
	percent = count/float(total)*100
else:
	percent = 0.0
print(str(total)+'\ttotal seq(s)')
print(str(count)+'\tcontains motifs ('+str(percent)+'%)')