Example #1
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    """Score each FASTA sequence against each PWM and write wiggle files.

    For every (FASTA file, PWM file) pair the PWM and its reverse complement
    are scored with ``getMOODSscore``; the two results are stacked with
    ``np.append(..., axis=0)``, reduced with ``getMaxPWMScore`` and written
    out through ``vegardswritewiggle``.

    :param fastafiles: iterable of FASTA file paths; only the first record of
        each file is used.
    :param pwmfiles: iterable of PWM file paths readable by
        ``MOODS.load_matrix``.
    :param destdir: output root; files go below
        ``<destdir>/start_index_score/<pwm>/strand_<n>`` and
        ``<destdir>/best_score_in_window/<pwm>``.
    :param both_strands: accepted for interface compatibility; this body never
        reads it (both strands are always scored).
    :return: None.
    """
    # The original looped over the undefined name ``fastafile``.
    for fastaf in fastafiles:
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        with open(fastaf, "r") as handle:
            records = list(Bio.SeqIO.parse(handle, "fasta"))
        thisseq = records[0].seq
        print('Doing sequence  %s length= %s' % (thisseqname, len(thisseq)))

        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            print(' Doing pwm  %s' % (thispwmname,))
            thispwm = MOODS.load_matrix(pwmf)
            thispwmcomplement = MOODS.reverse_complement(thispwm)

            # NOTE(review): getMOODSscore is called here with a sequence and a
            # loaded matrix; confirm this matches the getMOODSscore variant
            # that is actually in scope.
            print('  strand 1')
            onestrandindexvector = getMOODSscore(thisseq, thispwm)
            print('  strand 2')
            otherstrandindexvector = getMOODSscore(thisseq, thispwmcomplement)

            print('  finding best score per bp')
            bothstrandsindexvector = np.append(onestrandindexvector,
                                               otherstrandindexvector,
                                               axis=0)
            bestscorevector = getMaxPWMScore(bothstrandsindexvector,
                                             len(thispwm[0]))

            stranddir = (destdir + '/' + 'start_index_score/' + thispwmname +
                         '/strand_')
            for strandnbr in range(len(bothstrandsindexvector)):
                print('  writing wiggle for strand %s' % (strandnbr,))
                vegardswritewiggle(bothstrandsindexvector[strandnbr, :],
                                   name=thispwmname,
                                   chr=thisseqname,
                                   destpath=stranddir + str(strandnbr))

            print('  writing wiggle for bestscore')
            vegardswritewiggle(bestscorevector,
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' +
                               'best_score_in_window/' + thispwmname)
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    """Run MOODS for several PWM files against the first record of a FASTA file.

    :param seqfile: path of a FASTA file; only ``records[0]`` is scored.
    :param pwmfiles: sequence of PWM file paths for ``MOODS.load_matrix``.
    :param both_strands: when true, the reverse complement of every matrix is
        searched as well and appended (``axis=0``) to that matrix's result.
    :return: list with one entry per PWM file, each produced by
        ``vegardparseMOODSres``.
    """
    # ``with`` closes the handle even when the FASTA parser raises.
    with open(seqfile, "r") as handle:
        records = list(Bio.SeqIO.parse(handle, "fasta"))
    seq = records[0].seq
    seqlength = len(seq)
    print('len(seq)= %s' % (seqlength,))

    matrixlist = []
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print('pwm  %s windowlength= %s' % (f, len(matrix[0])))
        matrixlist.append(matrix)
        if both_strands:
            # The reverse complement is searched as an extra matrix because
            # the both_strand option in MOODS returned an awkward result.
            matrixlist.append(MOODS.reverse_complement(matrix))

    print('starting MOODS.search %s' % (datetime.now(),))
    results = MOODS.search(seq,
                           matrixlist,
                           thresholds=1,
                           absolute_threshold=False)
    print('done MOODS.search %s' % (datetime.now(),))

    # Each PWM owns one result slot, or two consecutive slots when its reverse
    # complement was added. An explicit stride avoids ``1 + both_strands``,
    # which is wrong for truthy values other than True.
    stride = 2 if both_strands else 1
    reslist = []
    for n in range(len(pwmfiles)):
        thisind = n * stride
        scores = vegardparseMOODSres(results[thisind], seqlength)
        if both_strands:
            scores = np.append(scores,
                               vegardparseMOODSres(results[thisind + 1],
                                                   seqlength),
                               axis=0)
        reslist.append(scores)
    return reslist
Example #3
0
def MOODS_search(seq, motif, thresholds=0):
    """Search ``seq`` with MOODS; an equivalent to Motif.search_pwm().

    :param seq: sequence handed to ``MOODS.search`` unchanged.
    :param motif: object whose ``log_odds()`` returns one mapping per motif
        position; the values of each mapping are taken in sorted-key order.
    :param thresholds: threshold passed to ``MOODS.search`` (absolute).
    :return: the hits of the single searched matrix, sorted by the absolute
        value of their first element.
    :raises RuntimeError: if the module-level ``USE_MOODS`` flag is false.
    """
    if not USE_MOODS:
        raise RuntimeError("MOODS could not be imported")

    # Real lists (not lazy ``map`` objects) so the matrix is identical on
    # Python 2 and Python 3.
    columns = [[value for _, value in sorted(column.items())]
               for column in motif.log_odds()]
    matrix_ = MOODS.transpose(columns)

    # Note: algorithm = 'lf' fails due to segmentation fault
    results_per_matrix = MOODS.search(
        seq,
        [matrix_],
        thresholds,
        bg=None,
        algorithm="pla",
        q=7,
        absolute_threshold=True,
        both_strands=True,
        combine=True,
    )

    # Format as Motif.search_pwm results. The original used
    # key=itemgetter(0) with cmp=abs-comparison, which is Python 2 only; a
    # single key function gives the same (stable) ordering.
    search_results = results_per_matrix[0]
    return sorted(search_results, key=lambda hit: abs(hit[0]))
Example #4
0
def getPWMscores(regions, PWM, fasta, regionsFile):
    """
    Score every basepair of a set of regions with a match to a PWM.

    :param regions: `bedtools.BedTool` with regions to look at.
    :type regions: bedtools.BedTool
    :param PWM: Matrix passed to `MOODS.search` as the only matrix.
    :type PWM:
    :param fasta: Fasta file with genome sequence.
    :type fasta: str
    :param regionsFile: Tab-separated file with one region per line; the
        fourth column of each line becomes the row label.
    :type regionsFile: str
    :returns: Data frame with one row per region and columns labelled
        -990..989, so every region must yield exactly 1980 scores.
    :rtype: pandas.DataFrame
    """
    import MOODS
    import numpy as np
    import pandas as pd

    # Get nucleotides: the extracted FASTA alternates header and sequence
    # lines, so every second line starting at index 1 is a sequence.
    seq = regions.sequence(s=True, fi=fasta)
    with open(seq.seqfn) as handle:
        seqs = handle.read().split("\n")[1::2]

    # Match strings with PWM, keeping only the score of each reported hit.
    scores = []
    for sequence in seqs:
        result = MOODS.search(sequence, [PWM], 30)
        scores.append(np.array([score for _, score in result[0]]))

    # Skip empty lines instead of blindly dropping the last one, so a file
    # without a trailing newline does not lose its final region. The handle is
    # now closed as well.
    with open(regionsFile) as handle:
        names = [line.split("\t")[3]
                 for line in handle.read().split("\n") if line]

    return pd.DataFrame(scores, index=names, columns=range(-990, 990))
Example #5
0
def getPWMscores(regions, PWM, fasta, regionsFile):
    """
    Score every basepair of a set of regions with a match to a PWM.

    :param regions: `bedtools.BedTool` with regions to look at.
    :type regions: bedtools.BedTool
    :param PWM: Matrix passed to `MOODS.search` as the only matrix.
    :type PWM:
    :param fasta: Fasta file with genome sequence.
    :type fasta: str
    :param regionsFile: Tab-separated file with one region per line; the
        fourth column of each line becomes the row label.
    :type regionsFile: str
    :returns: Data frame with one row per region and columns labelled
        -990..989, so every region must yield exactly 1980 scores.
    :rtype: pandas.DataFrame
    """
    import MOODS
    import numpy as np
    import pandas as pd

    # Get nucleotides (header/sequence lines alternate in the FASTA output).
    seq = regions.sequence(s=True, fi=fasta)
    with open(seq.seqfn) as fasta_handle:
        seqs = fasta_handle.read().split("\n")[1::2]

    # Match strings with PWM; only the score of each hit is kept.
    scores = [
        np.array([score for _, score in MOODS.search(sequence, [PWM], 30)[0]])
        for sequence in seqs
    ]

    # Read region names inside a context manager (the original never closed
    # the file) and ignore empty lines rather than popping the last entry,
    # which lost a region when the file had no trailing newline.
    with open(regionsFile) as regions_handle:
        region_lines = [line for line in regions_handle.read().split("\n")
                        if line]
    names = [line.split("\t")[3] for line in region_lines]

    return pd.DataFrame(scores, index=names, columns=range(-990, 990))
Example #6
0
def getMOODSscore_old(seq, mat):
    """Return a (1, len(seq)) float32 array of MOODS scores by hit position.

    :param seq: sequence passed to ``MOODS.search``.
    :param mat: single matrix to search with.
    :return: array filled with -100 and overwritten, at each reported hit
        position, with that hit's score.
    """
    results = MOODS.search(seq, [mat], thresholds=1, absolute_threshold=False)
    # np.float32 replaces the upper-case 'Float32' alias, which current NumPy
    # no longer accepts; np.full replaces zeros-then-overwrite.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for position, score in results[0]:
        resarray[0, position] = score
    return resarray
Example #7
0
def getMOODSscore_old(seq, mat):
    """Score ``seq`` against one matrix and lay the hits out by position.

    :param seq: sequence passed to ``MOODS.search``.
    :param mat: single matrix to search with.
    :return: float32 array of shape (1, len(seq)); every cell holds -100
        except the positions of reported hits, which hold the hit score.
    """
    hits = MOODS.search(seq, [mat], thresholds=1, absolute_threshold=False)[0]
    # The upper-case dtype alias 'Float32' is rejected by current NumPy, so
    # the canonical np.float32 is used.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for position, score in hits:
        resarray[0, position] = score
    return resarray
Example #8
0
def search(consensus_list, TF, master, search_region, p):
    """Scan FASTA-like lines with MOODS and print each new motif match.

    ``search_region`` is iterated line by line: lines containing ``'strand'``
    are skipped, lines containing ``'>'`` become the current header, and every
    other line is searched against all matrices in ``master``.

    :param consensus_list: consensus sequence per matrix; its length is used
        as the match width and it is passed to ``align``.
    :param TF: transcription factor name per matrix (same order as master).
    :param master: list of matrices handed to ``MOODS.search``.
    :param search_region: iterable of header and sequence lines.
    :param p: threshold passed to ``MOODS.search``.
    :return: None; results are printed to stdout.

    When ``sys.argv[3] == "True"`` only one line per (TF, header) pair is
    printed; otherwise every distinct (TF, header, position) is printed and
    aligned.
    """
    try:
        interaction_only = sys.argv[3] == "True"
    except IndexError:
        interaction_only = False

    # Progress output: one running number per matrix on stderr.
    for count in range(1, len(master) + 1):
        sys.stderr.write('%s\n' % (count,))

    # A set gives O(1) duplicate checks (the original scanned a list).
    seen = set()
    header = ''
    for region in search_region:
        if 'strand' in region:
            continue
        if '>' in region:
            header = region
            continue
        if not master:
            continue

        # One search already covers every matrix. The original repeated the
        # identical call len(master) times and discarded the repeats through
        # the duplicate check.
        result = MOODS.search(region, master, p)
        for check, hits in enumerate(result):
            tf_name = TF[check]
            consensus = consensus_list[check]
            tf_length = len(consensus)
            for hit in hits:
                position = hit[0]
                if interaction_only:
                    key = (tf_name, header)
                    if key in seen:
                        continue
                    seen.add(key)
                    # Print the factor that actually matched (TF[check]); the
                    # original printed TF[i] of the redundant outer loop.
                    print('%s %s' % (tf_name.strip(), header.strip()[1:]))
                    print('')
                else:
                    key = (tf_name, header, position)
                    if key in seen:
                        continue
                    seen.add(key)
                    print('%s %s' % (tf_name.strip(), header.strip()[1:]))
                    print('position: %s' % (position,))
                    align(consensus, region[position:position + tf_length])
                    print('')
Example #9
0
def ProcessSeqs(SEQ_HANDLE, PWMS, THRESHOLD, WANT_REV=False, bg=None):
    """Yield one match dictionary per PWM hit on every interval read.

    ``PWMS`` is a sequence of (name, matrix) pairs.  A score threshold is
    derived for each matrix with ``MOODS.threshold_from_p`` and every interval
    produced by ``ReadInterval(SEQ_HANDLE)`` is searched with all matrices.
    Hits scoring strictly above the threshold are yielded with the keys NAME,
    START, END, STRAND, PWM, SCORE, CHROM and SEQ; any other hit only prints
    'got bad result'.
    """
    names = [entry[0] for entry in PWMS]
    mats = [entry[1] for entry in PWMS]
    cutoffs = [MOODS.threshold_from_p(mat, bg, THRESHOLD) for mat in mats]

    for interval in ReadInterval(SEQ_HANDLE):
        print(interval['NAME'])

        per_matrix_hits = MOODS.search(interval['SEQ'].upper(),
                                       mats,
                                       cutoffs,
                                       both_strands=WANT_REV,
                                       algorithm='lf',
                                       absolute_threshold=True,
                                       bg=bg)

        for hits, name, mat, cutoff in zip(per_matrix_hits, names, mats,
                                           cutoffs):
            width = len(mat[0])
            for position, score in hits:
                if not score > cutoff:
                    print('got bad result')
                    continue
                begin = int(interval['START']) + position
                stop = int(interval['START']) + width + position
                matched = interval['SEQ'][position:(position + width)].upper()
                yield {
                    'NAME': interval['NAME'],
                    'START': begin,
                    'END': stop,
                    'STRAND': interval['STRAND'],
                    'PWM': name,
                    'SCORE': score,
                    'CHROM': interval['CHROM'],
                    'SEQ': matched
                }
Example #10
0
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    """Score the first FASTA record of ``seqfile`` against every PWM file.

    :param seqfile: path of a FASTA file; only its first record is used.
    :param pwmfiles: sequence of PWM file paths for ``MOODS.load_matrix``.
    :param both_strands: when true, each matrix's reverse complement is
        searched too and its parsed result is appended along axis 0.
    :return: list holding one ``vegardparseMOODSres`` result per PWM file.
    """
    # Context manager: the handle is released even if parsing fails.
    with open(seqfile, "r") as handle:
        records = list(Bio.SeqIO.parse(handle, "fasta"))
    seq = records[0].seq
    print('len(seq)= %s' % (len(seq),))

    matrixlist = []
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print('pwm  %s windowlength= %s' % (f, len(matrix[0])))
        matrixlist.append(matrix)
        if both_strands:
            # both_strand option in MOODS returned an awkward result, so the
            # reverse complement is added as a matrix of its own.
            matrixlist.append(MOODS.reverse_complement(matrix))

    print('starting MOODS.search %s' % (datetime.now(),))
    results = MOODS.search(seq, matrixlist, thresholds=1,
                           absolute_threshold=False)
    print('done MOODS.search %s' % (datetime.now(),))

    # Results come back in matrixlist order: one slot per PWM, or two
    # consecutive slots (forward, reverse complement) with both_strands.
    # An explicit 1-or-2 stride replaces ``1 + both_strands``, which was wrong
    # for truthy values other than True.
    slots_per_pwm = 2 if both_strands else 1
    reslist = []
    for n in range(len(pwmfiles)):
        thisind = n * slots_per_pwm
        parsed = vegardparseMOODSres(results[thisind], len(seq))
        if both_strands:
            reverse = vegardparseMOODSres(results[thisind + 1], len(seq))
            parsed = np.append(parsed, reverse, axis=0)
        reslist.append(parsed)
    return reslist
Example #11
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    """Write per-start-index and best-in-window wiggle files for each pair.

    For every FASTA file and every PWM file, ``getMOODSscore`` is called, the
    result is reduced with ``getMaxPWMScore`` and both are written through
    ``vegardswritewiggle``.

    :param fastafiles: iterable of FASTA paths; only the first record of each
        file is read, and only to report its length.
    :param pwmfiles: iterable of PWM paths readable by ``MOODS.load_matrix``.
    :param destdir: output root; files go to
        ``<destdir>/start_index_score/<pwm>`` and
        ``<destdir>/best_score_in_window/<pwm>``.
    :param both_strands: accepted for interface compatibility; never read by
        this body.
    :return: None.
    """
    # The original iterated the undefined name ``fastafile``.
    for fastaf in fastafiles:
        ### sequence only needed for length here. MOODS does this parsing
        ### again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        with open(fastaf, "r") as handle:
            records = list(Bio.SeqIO.parse(handle, "fasta"))
        thisseq = records[0].seq
        print('Doing sequence  %s length= %s' % (thisseqname, len(thisseq)))

        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            thispwm = MOODS.load_matrix(pwmf)
            pwmlength = len(thispwm[0])
            print(' Doing MOODS both strands for pwm  %s , length= %s %s' %
                  (thispwmname, pwmlength, datetime.now()))
            # NOTE(review): the third positional argument here is the sequence
            # length; confirm the getMOODSscore in scope expects that.
            onestrandsindexvector = getMOODSscore(fastaf, pwmf, len(thisseq))
            print('  bp with no score (given  %s ) is  %s  expected  %s' %
                  (NO_SCORE_VALUE,
                   (onestrandsindexvector == NO_SCORE_VALUE).sum(),
                   pwmlength - 1))
            print('  finding best score per bp,  %s' % (datetime.now(),))
            # The original had a stray trailing comma that wrapped this value
            # in a 1-tuple and then unwrapped it with [0]; the value is now
            # used directly.
            bestscorevector = getMaxPWMScore(onestrandsindexvector, pwmlength)

            print('  writing wiggle for score per start index. %s' %
                  (datetime.now(),))
            vegardswritewiggle(onestrandsindexvector[0, ],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'start_index_score/' +
                               thispwmname)

            print('  writing wiggle for bestscore.  %s' % (datetime.now(),))
            vegardswritewiggle(bestscorevector,
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' +
                               'best_score_in_window/' + thispwmname)
Example #12
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    """Produce wiggle score files for every FASTA file / PWM file combination.

    Each combination is scored with ``getMOODSscore``; the per-start-index
    scores and the ``getMaxPWMScore`` reduction are both written with
    ``vegardswritewiggle``.

    :param fastafiles: iterable of FASTA paths (first record of each is used
        to report the sequence length).
    :param pwmfiles: iterable of PWM paths readable by ``MOODS.load_matrix``.
    :param destdir: output root containing ``start_index_score/`` and
        ``best_score_in_window/`` entries named after the PWM file.
    :param both_strands: kept for interface compatibility; unused here.
    :return: None.
    """
    start_index_root = destdir + '/' + 'start_index_score/'
    best_score_root = destdir + '/' + 'best_score_in_window/'

    # Fixed: the loop used the undefined name ``fastafile``.
    for fastaf in fastafiles:
        ### sequence only needed for length here. MOODS does this parsing
        ### again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        with open(fastaf, "r") as handle:
            records = list(Bio.SeqIO.parse(handle, "fasta"))
        seqlength = len(records[0].seq)
        print('Doing sequence  %s length= %s' % (thisseqname, seqlength))

        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            thispwm = MOODS.load_matrix(pwmf)
            print(' Doing MOODS both strands for pwm  %s , length= %s %s' %
                  (thispwmname, len(thispwm[0]), datetime.now()))
            # NOTE(review): passes the sequence length as third positional
            # argument; verify against the getMOODSscore actually in scope.
            onestrandsindexvector = getMOODSscore(fastaf, pwmf, seqlength)
            unscored = (onestrandsindexvector == NO_SCORE_VALUE).sum()
            print('  bp with no score (given  %s ) is  %s  expected  %s' %
                  (NO_SCORE_VALUE, unscored, len(thispwm[0]) - 1))
            print('  finding best score per bp,  %s' % (datetime.now(),))
            # No trailing comma here: the original accidentally built a
            # 1-tuple and compensated with [0] when writing.
            bestscorevector = getMaxPWMScore(onestrandsindexvector,
                                             len(thispwm[0]))

            print('  writing wiggle for score per start index. %s' %
                  (datetime.now(),))
            vegardswritewiggle(onestrandsindexvector[0, ],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=start_index_root + thispwmname)

            print('  writing wiggle for bestscore.  %s' % (datetime.now(),))
            vegardswritewiggle(bestscorevector,
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=best_score_root + thispwmname)
Example #13
0
def match_single(motif,
                 sequence,
                 genomic_region,
                 unique_threshold=None,
                 normalize_bitscore=True,
                 sort=False):
    """
    Performs motif matching given sequence and the motif.pssm_list passed as
    parameter. The genomic_region is needed to evaluate the correct binding
    position.

    Keyword arguments:
    motif -- Motif object. The attributes read here are pssm_list, threshold,
             max, len, name and is_palindrome.
    sequence -- A DNA sequence (string).
    genomic_region -- A GenomicRegion; its `initial` is the offset applied to
                      match positions and its `chrom` is copied to every hit.
    unique_threshold -- If this argument is provided (truthy), the motif
                        search will be made using a threshold of 0 and then
                        accepting only the motif matches with
                        bitscore/motif_length >= unique_threshold.
    normalize_bitscore -- If True, store a score rescaled to an integer
                          (1000 corresponds to the motif maximum); otherwise
                          store the bitscore (divided by the motif length when
                          unique_threshold is used).
    sort -- If True, sort the resulting set before returning it.

    Return:
    A GenomicRegionSet named "mpbs" with one GenomicRegion per accepted match;
    the score is stored as a string in the region's `data`.
    """

    # Establishing threshold
    if unique_threshold:
        current_threshold = 0.0
        eval_threshold = unique_threshold
        motif_max = motif.max / motif.len
    else:
        current_threshold = motif.threshold
        eval_threshold = motif.threshold
        motif_max = motif.max

    # Performing motif matching
    try:
        # old MOODS version
        results = MOODS.search(sequence, [motif.pssm_list],
                               current_threshold,
                               absolute_threshold=True,
                               both_strands=True)
    except Exception:
        # Any failure of the old-style call falls back to the newer scan API.
        # `Exception` (not a bare except) keeps KeyboardInterrupt and
        # SystemExit propagating.
        # TODO: we can expand this to use bg from sequence, for example,
        # or from organism.
        bg = MOODS.tools.flat_bg(4)
        results = MOODS.scan.scan_dna(sequence, [motif.pssm_list], bg,
                                      [current_threshold], 7)

    grs = GenomicRegionSet("mpbs")

    for search_result in results:
        for r in search_result:
            # Hits are either objects exposing .pos/.score or plain
            # (position, score) pairs, depending on which API produced them.
            try:
                position = r.pos
                score = r.score
            except AttributeError:
                (position, score) = r

            # Verifying unique threshold acceptance
            if unique_threshold and score / motif.len < unique_threshold:
                continue

            # If match forward strand
            if position >= 0:
                p1 = genomic_region.initial + position
                strand = "+"
            # If match reverse strand (reported as a negative position)
            elif not motif.is_palindrome:
                p1 = genomic_region.initial - position
                strand = "-"
            else:
                # Reverse-strand hit of a palindromic motif: skipped.
                continue

            # Evaluating p2
            p2 = p1 + motif.len

            # Evaluating score (integer between 0 and 1000 -- needed for
            # bigbed transformation)
            if normalize_bitscore:
                # NOTE(review): with unique_threshold the bounds are
                # per-position values while `score` is the raw bitscore;
                # confirm this scale mix is intended.
                if motif_max > eval_threshold:
                    norm_score = int(((score - eval_threshold) * 1000.0) /
                                     (motif_max - eval_threshold))
                else:
                    norm_score = 1000
            else:
                # Keep the original bitscore
                if unique_threshold:
                    norm_score = score / motif.len
                else:
                    norm_score = score

            grs.add(
                GenomicRegion(genomic_region.chrom,
                              int(p1),
                              int(p2),
                              name=motif.name,
                              orientation=strand,
                              data=str(norm_score)))

    if sort:
        grs.sort()

    return grs
Example #14
0
####### running MOODS algorithm on sequence with all pwm files.
# This script section relies on names defined outside the visible excerpt:
# fastafile, pwmfiles, calculate_both_strands, seqname, outputdir,
# getMOODSscore, getMaxPWMScore and vegardswritewiggle.
#datetime.now()
print 'running getMOODSscore', datetime.now()
indexscorematrix = getMOODSscore(fastafile, pwmfiles, both_strands=calculate_both_strands)
print 'finished getMOODSscore', datetime.now()
#datetime.now()

## for every pwm:
## build the max array
## write out 3 files.
for n in range(len(pwmfiles)):
    thisname = pwmfiles[n].split('/')[-1]
    print 'making maxscpre for ',thisname, datetime.now()
    thisscorematrix = indexscorematrix[n] # scores belonging to pwmfiles[n]
    thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0])
    thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength)
    
    print 'writing wiggle for ',thisname, datetime.now()
    ### best score file
    vegardswritewiggle(thismaxvector, name=thisname, chr=seqname, path=outputdir + '/' + 'best_score_in_window/'+thisname)
    # one start-index file per row of the score matrix
    for strandnbr in range(len(thisscorematrix)):
        vegardswritewiggle(thisscorematrix[strandnbr,:], name=thisname, chr=seqname, path=outputdir + '/' + 'start_index_score/'+ thisname+ '/strand_'+str(strandnbr))
    



# NOTE(review): temp1 is not assigned anywhere in the visible code; this line
# looks like leftover scratch work -- confirm before running.
temp1 = getMaxPWMScore(temp1, thispwmlength)

    
# NOTE(review): everything below repeats the section above with different
# line wrapping, so the whole computation runs a second time.
#datetime.now()
print 'running getMOODSscore', datetime.now()
indexscorematrix = getMOODSscore(fastafile,
                                 pwmfiles,
                                 both_strands=calculate_both_strands)
print 'finished getMOODSscore', datetime.now()
#datetime.now()

## for every pwm:
## build the max array
## write out 3 files.
for n in range(len(pwmfiles)):
    thisname = pwmfiles[n].split('/')[-1]
    print 'making maxscpre for ', thisname, datetime.now()
    thisscorematrix = indexscorematrix[n]  # scores belonging to pwmfiles[n]
    thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0])
    thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength)

    print 'writing wiggle for ', thisname, datetime.now()
    ### best score file
    vegardswritewiggle(thismaxvector,
                       name=thisname,
                       chr=seqname,
                       path=outputdir + '/' + 'best_score_in_window/' +
                       thisname)
    # one start-index file per row of the score matrix
    for strandnbr in range(len(thisscorematrix)):
        vegardswritewiggle(thisscorematrix[strandnbr, :],
                           name=thisname,
                           chr=seqname,
                           path=outputdir + '/' + 'start_index_score/' +
                           thisname + '/strand_' + str(strandnbr))
Example #16
0
def _iter_fasta_records(handle):
    """Yield (header without '>', upper-cased sequence) per FASTA record."""
    seq_id = None
    chunks = []
    for raw_line in handle:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            if seq_id is not None:
                yield seq_id, ''.join(chunks).upper()
            seq_id = line[1:]
            chunks = []
        elif seq_id is None:
            raise ValueError(
                'sequence data found before the first FASTA header')
        else:
            chunks.append(line)
    if seq_id is not None:
        yield seq_id, ''.join(chunks).upper()


def ProcessCLI(args):
    """Map every ``.pfm`` matrix of a directory onto both strands of a FASTA file.

    Recognised options in ``args`` (each followed by its value):
    ``-f`` FASTA file, ``-p`` directory with ``.pfm`` matrices,
    ``-o`` output directory (created if missing), ``-t`` p-value (float).

    The nucleotide background is computed from the forward strands of the
    FASTA file. Matrices are converted with ``MOODS.count_log_odds``
    (pseudocount 1) and thresholds come from ``MOODS.threshold_from_p``.
    Matrices with 18 or more columns go to ``matrix_mapper_long``, all others
    to ``matrix_mapper``.

    :param args: list of command-line tokens.
    :return: None.
    :raises ValueError: if an option is missing its value, the FASTA file is
        malformed, or it contains no sequence data.
    """
    outputDirectory = '/N/u/jubudka/Mason/BindingFiles/'
    weightMatrixDirectory = '/N/u/jubudka/Mason/PWMsmall/'
    sequencesFileName = 'FASTA_All_Merged_Encode.fasta'
    p_val = 0.0001

    print(args)
    for i, arg in enumerate(args):
        if arg not in ('-f', '-p', '-o', '-t'):
            continue
        if i + 1 >= len(args):
            raise ValueError('option %s requires a value' % (arg,))
        value = args[i + 1]
        if arg == '-f':
            sequencesFileName = value
            print('Fasta file is:  %s' % (sequencesFileName,))
        elif arg == '-p':
            weightMatrixDirectory = value
            print('PWM file is:  %s' % (weightMatrixDirectory,))
        elif arg == '-o':
            outputDirectory = value
            print('Output file is:  %s' % (outputDirectory,))
        else:
            p_val = float(value)

    if not os.path.exists(outputDirectory):
        os.makedirs(outputDirectory)

    # Every record is stored twice: '<id> p' holds the upper-cased sequence,
    # '<id> m' its reverse complement. Each sequence stays attached to its own
    # header; the original ID queue shifted IDs after an empty record.
    sequences = {}
    baseCounts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    totalLength = 0
    with open(sequencesFileName) as sequencesFile:
        for seqID, sequence in _iter_fasta_records(sequencesFile):
            sequences[seqID + ' ' + 'p'] = sequence
            sequences[seqID + ' ' + 'm'] = reverse_complement(sequence)
            for base in baseCounts:
                baseCounts[base] += sequence.count(base)
            totalLength += len(sequence)

    if totalLength == 0:
        raise ValueError('no sequence data found in %s' %
                         (sequencesFileName,))

    # order is A C G T
    bg = [baseCounts[base] / float(totalLength) for base in 'ACGT']
    print(bg)

    # load position weight matrices
    matrix_names = [filename
                    for filename in os.listdir(weightMatrixDirectory)
                    if filename.endswith('.pfm')]
    pseudocount = 1

    # os.path.join works with or without a trailing slash on the directory.
    matrices = [MOODS.load_matrix(os.path.join(weightMatrixDirectory,
                                               filename))
                for filename in matrix_names]
    matrices = [MOODS.count_log_odds(matrix, bg, pseudocount)
                for matrix in matrices]
    thresholds = [MOODS.threshold_from_p(matrix, bg, p_val)
                  for matrix in matrices]

    for matrix, matrix_name, threshold in zip(matrices, matrix_names,
                                              thresholds):
        # Motifs of 18 or more columns use the dedicated long-motif mapper.
        if len(matrix[0]) >= 18:
            matrix_mapper_long(matrix, matrix_name, threshold,
                               outputDirectory, sequences)
        else:
            matrix_mapper(matrix, matrix_name, threshold, outputDirectory,
                          sequences)

    print("Finished")
Example #17
0
import MOODS

# Toy scoring matrix with four rows of three columns.
# Presumably one row per nucleotide in A, C, G, T order -- TODO confirm.
matrix = [
    [10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [10, 10, 10],
]

# NOTE(review): absolute_threshold receives 30 here, while other call sites
# in this file pass True/False -- confirm the intended value.
results = MOODS.search(
    'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc',
    [matrix],
    30,
    absolute_threshold=30,
)

# One result list per searched matrix; report every (position, score) hit.
for i in results:
    for position, score in i:
        print("Position: %s Score: %s" % (position, score))
Example #18
0
import MOODS

# Toy scoring matrix: four rows of three columns.
# Presumably rows follow A, C, G, T order -- TODO confirm.
matrix = [
    [10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [10, 10, 10],
]

# NOTE(review): absolute_threshold is given the number 30 rather than a
# boolean as in other call sites of this file -- confirm.
results = MOODS.search(
    'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc',
    [matrix], 30, absolute_threshold=30)

# Print every hit of every searched matrix.
for i in results:
    for hit in i:
        (position, score) = hit
        print("Position: {0} Score: {1}".format(str(position), str(score)))
Example #19
0
def _write_bed_hits(handle, chrom_name, motif_name, motif_size, hits):
    """Write one tab-separated BED line per (position, score) hit."""
    for position, score in hits:
        start, end, strand = strand_adjust(position, motif_size)
        # Add option to round the score values.  Defaulting to int atm
        # since bedToBigBed only accepts integer values....
        # The '\n' element is joined like a field, so each line ends with a
        # tab before the newline (kept byte-identical to earlier output).
        handle.write('\t'.join([
            chrom_name,
            str(start),
            str(end), motif_name,
            str(int(score * 100)), strand, '\n'
        ]))


def main():
    """Search every FASTA record with a set of PWMs and write BED files.

    Positional arguments: ``args[0]`` is the PWM file (parsed by
    ``pySeq.parsing.PWMparser.parse``), ``args[1]`` the FASTA file.  One
    ``<name><pwm>.bed`` file is written per selected PWM.  Matrices of size
    20 or less are searched with MOODS algorithm 'lf', larger ones with
    'supera'.  With ``-M`` only the named matrix is searched; without it all
    matrices are.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-t',
                 '--thresh',
                 action='store',
                 dest='threshold',
                 default=0.0,
                 help='determines threshold')
    p.add_option('-a',
                 '--append',
                 action='store',
                 dest='name',
                 default='resultsfor',
                 help='appends pwm name to this when\
                 creating files')
    p.add_option('-A',
                 '--absolute',
                 action='store_true',
                 dest='A',
                 default=False,
                 help='absolute threshold')
    p.add_option('-s',
                 '--standard_background',
                 action='store_true',
                 dest='stdbg')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific')
    options, args = p.parse_args()

    # 'rU' is kept for Python 2 universal-newline handling; note that
    # Python 3.11+ rejects the 'U' flag.
    with open(args[0], 'rU') as pwm:
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)
    with open(args[1], 'rU') as fa:
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))

    bgt = [0.25, 0.25, 0.25, 0.25] if options.stdbg else False
    threshold = float(options.threshold)

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}

    try:
        # Construct Matrices to search and files to write to.
        # Fixed: the original attached its ``else`` to the wrong ``if``, so
        # -M filtered nothing and omitting -M selected no matrix at all.
        for k in index.keys():
            if options.specific and k != options.specific:
                continue
            fileout[k] = open(options.name + k + '.bed', 'w')
            if sizes[k] <= 20:
                underorequal20.append(matricies[k])
                under20names.append(k)
            else:
                over20.append(matricies[k])
                over20names.append(k)

        # (names, matrices, MOODS algorithm) for the short and long groups.
        groups = ((under20names, underorequal20, 'lf'),
                  (over20names, over20, 'supera'))

        for chrom in pfa:
            print(chrom.name)
            # Should we sort the results as all downstream applications
            # require a sort first
            for names, group_matrices, algorithm in groups:
                if not group_matrices:
                    continue  # nothing to search in this size group
                res = MOODS.search(chrom.seq,
                                   group_matrices,
                                   threshold,
                                   absolute_threshold=options.A,
                                   both_strands=True,
                                   bg=bgt,
                                   algorithm=algorithm)
                for motif_name, hits in zip(names, res):
                    _write_bed_hits(fileout[motif_name], chrom.name,
                                    motif_name, sizes[motif_name], hits)
    finally:
        # Output files were never closed before; close them even on errors.
        for handle in fileout.values():
            handle.close()
Example #20
0
# Compare the one-shot MOODS.search call with a reusable MOODSSearch object
# on the first record returned by fasta.parseFasta.
records = fasta.parseFasta(fasta_filepath)

# Each record is indexed as a pair here; element 1 is used as the sequence.
seq = records[0][1]

matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row of matrix2 has 6 entries while the other rows
# have 7 -- confirm which value is missing.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]

# One-shot search: a single threshold shared by both matrices.
results = MOODS.search(seq, [matrix1, matrix2], 0.011)

print("Matrix 1 results: %s" % (len(results[0]),))
print("Matrix 2 results: %s" % (len(results[1]),))


# Reusable search object built from explicit settings.
matrices = [matrix1, matrix2]
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(matrices, thresholds, bg, q, absolute_threshold,
                       both_strands)
results = ms.search(seq)

print("New Matrix 1 results: %s" % (len(results[0]),))
Example #21
0
def find_motif_disruptions(
    position, 
    ref, 
    alt, 
    genome_fasta, 
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    The reference sequence around the variant is fetched with
    ``samtools faidx``, the alternate sequence is built by substituting
    ``alt`` for ``ref``, both are scanned with MOODS on both strands, and
    the best match overlapping the variant is compared per motif.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.

    ref : str
        Reference allele. This should match the reference sequence at "position"
        in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data frames 
        or numpy arrays containing PWMs with columns ACGT.

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame with motifs whose best matches that overlapped the
        variant differed between the reference and alternate sequences. A score
        of zero and a strand of '' indicates that there was not a match for the
        motif on the given allele.

    Raises
    ------
    subprocess.CalledProcessError
        If samtools exits with a non-zero status.

    """
    import subprocess
    import MOODS

    motif_names = list(matrices.keys())
    max_motif_length = max(matrices[name].shape[0] for name in motif_names)
    # Number of flanking bases fetched on each side of the variant so that
    # every motif placement overlapping the variant fits in the sequence.
    flank = max_motif_length - 1

    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]
    # NOTE(review): samtools region strings are 1-based and inclusive, while
    # the docstring describes zero-based coordinates. The slicing below
    # assumes the ref allele begins at offset `flank` of the fetched sequence
    # -- confirm the coordinate convention callers actually use.
    region = '{}:{}-{}'.format(chrom, start - flank, end + flank)
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters are handled safely; universal_newlines=True yields text
    # rather than bytes on Python 3.
    faidx_out = subprocess.check_output(
        ['samtools', 'faidx', genome_fasta, region],
        universal_newlines=True)
    seq_lines = faidx_out.strip().split()
    # seq_lines[0] is the FASTA header; samtools wraps long sequences over
    # several lines, so all remaining lines must be concatenated.
    ref_seq = ''.join(seq_lines[1:])
    alt_seq = ref_seq[:flank] + alt + ref_seq[flank + len(ref):]

    ref_variant_start = flank
    ref_variant_end = flank + len(ref)
    alt_variant_start = flank
    alt_variant_end = flank + len(alt)

    # MOODS wants plain nested lists with one row per base. `.T` works for
    # both data frames and numpy arrays; only data frames expose `.values`.
    ms = []
    for name in motif_names:
        transposed = matrices[name].T
        ms.append(getattr(transposed, 'values', transposed).tolist())

    flat_bg = [0.25, 0.25, 0.25, 0.25]
    ref_res = dict(zip(motif_names,
                       MOODS.search(ref_seq, ms, 0.001, both_strands=True,
                                    bg=flat_bg)))
    alt_res = dict(zip(motif_names,
                       MOODS.search(alt_seq, ms, 0.001, both_strands=True,
                                    bg=flat_bg)))

    def _best(matches):
        """Return (score, strand) of the top-scoring match, or (0, '')."""
        if len(matches) == 0:
            return 0, ''
        pos, score = max(matches, key=lambda m: m[1])
        # Reverse-strand hits are reported with negative positions, so
        # position 0 belongs to the forward strand.
        return score, '+' if pos >= 0 else '-'

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in motif_names:
        motif_length = matrices[motif].shape[0]
        ref_res[motif] = _filter_variant_motif_res(
            ref_res[motif], ref_variant_start, ref_variant_end,
            motif_length, ref_seq)
        alt_res[motif] = _filter_variant_motif_res(
            alt_res[motif], alt_variant_start, alt_variant_end,
            motif_length, alt_seq)

        ref_score, ref_strand = _best(ref_res[motif])
        alt_score, alt_strand = _best(alt_res[motif])
        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append([motif, ref_score, ref_strand, alt_score, alt_strand,
                         diff])

    out = pd.DataFrame(rows, columns=['motif', 'ref_score', 'ref_strand',
                                      'alt_score', 'alt_strand', 'score_diff'])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    # Keep only motifs whose best score actually changed.
    out = out[out.score_diff != 0]
    return out
Example #22
0
    def setUp(self):
        """Build the motif fixtures, search objects and expected hits."""

        # (name, count matrix) pairs, entered by hand; four rows per matrix.
        self.motif_matrices = [
            ('zfp4_yrk_3p',
             [[250, 130, 0, 0, 20, 0, 0, 40, 50],
              [100, 50, 0, 0, 0, 0, 10, 0, 0],
              [10, 200, 380, 380, 0, 380, 150, 340, 0],
              [20, 0, 0, 0, 360, 0, 220, 0, 330]]),
            ('ttgR',
             [[7.5, 44.5, 0.0, 0.0, 2.5, 47.5, 8.5],
              [21.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
              [3.5000000000000004, 0.0, 0.0, 50.0, 0.0, 0.0, 0.0],
              [18.0, 5.5, 50.0, 0.0, 47.5, 2.5, 43.0]]),
            ('cdaR',
             [[35, 4, 0, 3, 28, 7, 42, 49, 31, 23],
              [1, 7, 0, 57, 20, 46, 5, 1, 6, 5],
              [2, 0, 59, 0, 0, 0, 5, 1, 2, 25],
              [22, 49, 1, 0, 12, 7, 8, 9, 21, 7]]),
            ('p22_cI',
             [[80, 113, 0, 0, 0, 468, 581, 396, 35],
              [68, 20, 581, 0, 0, 35, 0, 10, 0],
              [160, 0, 0, 0, 0, 0, 0, 10, 0],
              [273, 448, 0, 581, 581, 78, 0, 165, 546]]),
            ('rpol_10',
             [[23, 373, 105, 210, 210, 0],
              [43, 0, 66, 51, 97, 19],
              [19, 3, 51, 55, 37, 11],
              [316, 25, 179, 85, 57, 371]]),
            ('zfp7_ZP10165',
             [[50, 0, 0, 50, 0, 0, 0, 0, 50, 0, 50],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 0, 0, 0, 0, 50, 50, 50, 0, 50, 0],
              [0, 50, 50, 0, 50, 0, 0, 0, 0, 0, 0]]),
            ('zfp1_efnba2_3p',
             [[3, 6, 0, 0, 6, 112, 0, 0, 6],
              [15, 112, 0, 0, 112, 0, 6, 118, 0],
              [100, 0, 118, 118, 0, 0, 112, 0, 112],
              [0, 0, 0, 0, 0, 6, 0, 0, 0]]),
            ('lacI',
             [[30, 30, 0, 65, 200, 10, 100, 100, 60, 130],
              [30, 100, 0, 165, 30, 230, 30, 30, 30, 30],
              [200, 100, 0, 10, 10, 10, 30, 30, 40, 40],
              [0, 30, 260, 20, 20, 10, 100, 100, 130, 60]]),
            ('tetR',
             [[120, 40, 300, 100, 220, 100, 190, 50],
              [200, 0, 20, 45, 100, 20, 50, 70],
              [20, 300, 50, 20, 20, 200, 100, 150],
              [50, 50, 20, 225, 50, 70, 50, 120]]),
            ('933W_cI',
             [[985, 548, 1544, 0, 0, 1252, 0, 149],
              [0, 801, 0, 0, 0, 20, 1680, 0],
              [11, 33, 184, 1728, 0, 0, 0, 1393],
              [732, 346, 0, 0, 1728, 456, 48, 186]]),
            ('zfp6_ZP10363',
             [[0, 0, 0, 50, 0, 50, 0, 0, 0, 50, 0],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 50, 0, 0, 50, 0, 0, 50, 50, 0, 50],
              [50, 0, 50, 0, 0, 0, 50, 0, 0, 0, 0]]),
            ('zfp5_ZN0024',
             [[50, 0, 50, 0, 0, 0, 50, 0, 0, 50, 0],
              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
              [0, 50, 0, 50, 0, 50, 0, 50, 50, 0, 0],
              [0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0]]),
            ('434_cI',
             [[621, 80, 186, 0, 0, 0, 0],
              [0, 76, 68, 30, 0, 0, 0],
              [0, 375, 0, 0, 0, 659, 0],
              [38, 128, 405, 629, 659, 0, 659]]),
            ('zfp2_dab2_3p',
             [[8, 178, 108, 0, 70, 158, 0, 150, 0],
              [0, 0, 0, 0, 0, 0, 0, 20, 8],
              [170, 0, 50, 178, 108, 20, 178, 0, 170],
              [0, 0, 20, 0, 0, 0, 0, 8, 0]]),
            ('acuR',
             [[0.0, 0.0, 0.0, 0.0, 25.0, 21.0, 17.5, 34.5],
              [0.0, 0.0, 50.0, 2.0, 7.5, 0.0, 0.0, 0.0],
              [50.0, 0.0, 0.0, 0.0, 0.0, 11.5, 0.0, 2.0],
              [0.0, 50.0, 0.0, 48.0, 17.5, 17.5, 32.5, 13.5]]),
            ('rpol_35',
             [[42, 18, 18, 173, 142, 134, 116],
              [0, 0, 6, 135, 140, 50, 90],
              [0, 72, 244, 0, 40, 72, 82],
              [359, 311, 133, 93, 79, 145, 113]]),
            ('zfp8_ZP10457',
             [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
              [50, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0],
              [0, 0, 0, 0, 50, 0, 0, 50, 50, 0, 0],
              [0, 50, 50, 50, 0, 50, 50, 0, 0, 0, 0]]),
            ('lambda_cI',
             [[50, 170, 45, 85, 30, 110, 0, 300],
              [310, 50, 0, 0, 0, 0, 0, 0],
              [50, 190, 365, 5, 380, 195, 0, 65],
              [0, 0, 0, 320, 0, 105, 410, 45]]),
            ('rpol_10_ext',
             [[0, 0, 100, 23, 373, 105, 210, 210, 0],
              [60, 0, 100, 43, 0, 66, 51, 97, 19],
              [140, 240, 100, 19, 3, 51, 55, 37, 11],
              [200, 160, 100, 316, 25, 179, 85, 57, 371]]),
        ]

        # One hand-entered threshold per motif, in the same order as above.
        self.thresholds = [
            1.0699545943859903,
            0.7897693125951122,
            0.6726083744651952,
            1.2680761835227123,
            1.2225352937342997,
            0.9676501362039236,
            0.87457893821108,
            0.8775565485963774,
            0.7705603886602805,
            1.4143797568075485,
            0.9676501362039236,
            0.9676501362039236,
            1.2748406244294834,
            1.024286840459233,
            0.6836163375410198,
            1.4530032582649177,
            0.9676501362039236,
            1.1239104755269196,
            2.6755385519992174,
        ]

        # Two search objects over the same motifs; they differ only in the
        # last positional flag (False for the "fwd" object, True for "both").
        fwd_matrices = [matrix for _name, matrix in self.motif_matrices]
        self.moodssearch_obj_fwd = MOODS.MOODSSearch(
            fwd_matrices, self.thresholds, MOODS.flatbg(), 7, True, False)

        both_matrices = [matrix for _name, matrix in self.motif_matrices]
        self.moodssearch_obj_both = MOODS.MOODSSearch(
            both_matrices, self.thresholds, MOODS.flatbg(), 7, True, True)

        # apFAB46 sequence
        self.apFAB46_seq = (
            'AAAAAGACAATGAAAAGCTTAGTCATGGCGCGCCAAAAAGAGTATTG'
            'ACTTCGCATCTTTTTGTACCTATAATAGATTCATTGCTA'
        )

        # Expected hits per motif as (position, score) pairs.
        self.apFAB46_hits = {
            '434_cI': [
                (57, 4.688399966083337),
                (-6, 3.3910215941638624),
            ],
            '933W_cI': [
                (-61, 4.046026313566105),
            ],
            'acuR': [
                (54, 2.0456700959741143),
                (-69, 3.325251483672279),
                (-34, 2.045670095974115),
                (-0.0, 7.348975004033191),
            ],
            'cdaR': [
                (29, 3.0419647953759767),
                (-16, 0.7214691456808224),
            ],
            'lacI': [
                (-57, 1.8194505679247397),
                (-40, 1.288724504297107),
            ],
            'lambda_cI': [
                (39, 1.9439365463638196),
            ],
            'p22_cI': [],
            'rpol_10': [
                (5, 2.0391667799057487),
                (67, 5.8335347738270835),
                (72, 3.2805992559471626),
                (-68, 4.930825591137652),
                (-54, 1.3544349004462848),
                (-40, 2.4522945494642174),
                (-38, 3.1644672601697525),
                (-15, 3.193650492100148),
                (-9, 1.7285308122946124),
            ],
            'rpol_10_ext': [
                (-40, 3.263862791624584),
            ],
            'rpol_35': [
                (9, 2.494746699845068),
                (44, 4.673608724636109),
                (57, 2.1790487168281887),
                (58, 2.226020656779173),
                (60, 2.779856657750668),
                (61, 1.54573352717996),
                (75, 1.6937781338634328),
                (-79, 1.524973502802859),
                (-74, 2.3203399149336397),
                (-32, 1.8592143421105267),
                (-19, 2.55933983358448),
                (-9, 3.369439254870037),
                (-8, 3.834192305019384),
                (-3, 4.054157145090022),
                (-2, 1.7887495239839581),
            ],
            'tetR': [
                (0, 2.35082455257586),
                (4, 2.301877909267698),
                (9, 0.853460927069236),
                (10, 2.155196197307788),
                (33, 1.1931970179468807),
                (34, 3.1110648177773217),
                (66, 1.533984872075769),
                (69, 2.8253186216029587),
                (73, 1.9382219304721406),
                (-75, 1.129366669586711),
                (-64, 2.4065472341224545),
                (-55, 1.1931970179468807),
                (-54, 2.8883371533478757),
                (-17, 1.282891279146519),
            ],
            'ttgR': [
                (8, 3.09623693618527),
                (59, 1.1998476766957555),
                (-75, 4.694180469193636),
                (-71, 2.2452537163828197),
            ],
            'zfp1_efnba2_3p': [],
            'zfp2_dab2_3p': [],
            'zfp4_yrk_3p': [],
            'zfp5_ZN0024': [],
            'zfp6_ZP10363': [],
            'zfp7_ZP10165': [],
            'zfp8_ZP10457': [],
        }

        # Hits actually produced by the both-strands search object.
        self.raw_hits = self.moodssearch_obj_both.search(self.apFAB46_seq)
Example #23
0
def main():
    """Scan every FASTA record with every selected PWM and write BED files.

    Command-line arguments: the PWM file followed by the FASTA file. One
    ``<name><motif>.bed`` output file is created per selected motif, where
    ``<name>`` is the ``--append`` prefix. With ``-M`` only the named motif
    is searched; without it all motifs are searched.

    Motifs of length <= 20 are searched with the 'lf' algorithm and longer
    ones with 'supera'. Scores are written as ``int(score * 100)``.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-t', '--thresh', action='store', dest='threshold',
                 default=0.0,help='determines threshold')
    p.add_option('-a', '--append', action='store', dest='name',
                 default='resultsfor', help='appends pwm name to this when\
                 creating files')
    p.add_option('-A', '--absolute', action='store_true',dest='A',
                 default=False,help='absolute threshold')
    p.add_option('-s','--standard_background',action='store_true',dest='stdbg')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific')
    options, args = p.parse_args()
    if len(args) < 2:
        p.error('expected two arguments: a PWM file and a FASTA file')

    threshold = float(options.threshold)
    bgt = [0.25, 0.25, 0.25, 0.25] if options.stdbg else False

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    # The context manager closes both inputs once parsing is done.
    with open(args[0], 'r') as pwm, open(args[1], 'r') as fa:
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}

    try:
        # Construct Matrices to search and files to write to.
        for k in index.keys():
            # With -M, restrict the run to the requested motif; otherwise
            # every motif is used.
            if options.specific and k != options.specific:
                continue
            fileout[k] = open(options.name + k + '.bed', 'w')
            if sizes[k] <= 20:
                underorequal20.append(matricies[k])
                under20names.append(k)
            else:
                over20.append(matricies[k])
                over20names.append(k)

        def search_and_write(chrom, matrices, names, algorithm):
            """Search one record with one group of matrices and write hits."""
            if not matrices:
                return
            # Should we sort the results as all downstream applications
            # require a sort first
            res = MOODS.search(chrom.seq, matrices, threshold,
                               absolute_threshold=options.A,
                               both_strands=True, bg=bgt,
                               algorithm=algorithm)
            for name, hits in zip(names, res):
                out = fileout[name]
                for position, score in hits:
                    start, end, strand = strand_adjust(position, sizes[name])
                    # Add option to round the score values.  Defaulting to
                    # int atm since bedToBigBed only accepts integer values.
                    # '\n' is joined as a final field, so every line ends
                    # with a tab before the newline; kept so the output
                    # format is unchanged.
                    out.write('\t'.join([chrom.name, str(start), str(end),
                                         name, str(int(score*100)), strand,
                                         '\n']))

        for chrom in pfa:
            print(chrom.name)
            search_and_write(chrom, underorequal20, under20names, 'lf')
            search_and_write(chrom, over20, over20names, 'supera')
    finally:
        # Always flush and release the per-motif output files.
        for handle in fileout.values():
            handle.close()
Example #24
0
import fasta

# Demo script: scan the bundled example sequence with two hand-written
# matrices, first via MOODS.search and then via a MOODS.MOODSSearch object.
# `abspath`, `dirname`, `join`, `LOCAL_DIR` and `MOODS` are not defined in this
# snippet; they must already be in scope.
DIST_DIR = abspath(dirname(dirname(LOCAL_DIR)))
print(DIST_DIR)
fasta_filepath = join(DIST_DIR, "examples/data/sequence/dnaACGT.txt")
records = fasta.parseFasta(fasta_filepath)

# The scanned sequence is element 1 of the first record.
# NOTE(review): presumably records are (header, sequence) pairs — confirm
# against fasta.parseFasta.
seq = records[0][1]

# Four-row matrices. NOTE(review): rows presumably correspond to A, C, G, T —
# confirm against the MOODS documentation.
matrix1 = [[0, 1, 0, 0, 0, 0, 0, 1, 1, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 0, 0, 1]]
# NOTE(review): the first row of matrix2 has 6 entries while the other three
# rows have 7 — confirm whether this ragged matrix is intended.
matrix2 = [[10, 0, 10, 3, 5, 5], [0, 5, 0, 3, 5, 0, 5], [0, 1, 0, 3, 0, 5, 0],
           [0, 4, 0, 1, 0, 0, 5]]

# One-shot search; `results` holds one entry per matrix, in the order given.
results = MOODS.search(seq, [matrix1, matrix2], 0.011)

print("Matrix 1 results: " + str(len(results[0])))
print("Matrix 2 results: " + str(len(results[1])))

# Same matrices and threshold values, this time through a MOODSSearch object
# built with an explicit background estimated from the sequence itself.
matrices = [matrix1, matrix2]
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(matrices, thresholds, bg, q, absolute_threshold,
                       both_strands)
# `results` is rebound here; the earlier MOODS.search output is discarded.
results = ms.search(seq)

# Only the first matrix's hit count is reported for the object-based search.
print("New Matrix 1 results: " + str(len(results[0])))
Example #25
0
def count(enhlist, c, p_val):
    """Find dinucleotide-repeat motif occurrences in one sequence.

    The sequence is assembled from the leading lines of ``enhlist`` up to
    (not including) the first line that starts with '>', with newlines
    removed.

    Parameters
    ----------
    enhlist : sequence of str
        Sequence lines; reading stops at the first line starting with '>'.
    c : str
        Search mode. "default" runs MOODS.search on the raw matrices with
        ``p_val``; "human" and "equal" convert the matrices to log-odds
        against a fixed background and search with thresholds derived from
        ``p_val``; any other value falls back to case-insensitive regular
        expressions for GA/CT, CA/GT, GC/CG and TA/AT repeats.
    p_val : float
        P-value handed to MOODS (unused by the regex fallback).

    Returns
    -------
    list
        Four per-motif result entries (GA, CA, GC, TA order) followed by the
        sequence length. For the MOODS modes each entry is whatever
        MOODS.search returns for that matrix; for the regex fallback each
        entry is a list of matched substrings.
    """
    # Manual debugging switch: set to True to append the matched substrings
    # of each motif to dated text files. Never active for the regex fallback.
    write_match = False

    # Collect the pieces and join once instead of rebuilding the string on
    # every line.
    seq_parts = []
    for line in enhlist:
        if line.startswith('>'):
            break
        seq_parts.append(line.replace('\n', ''))
    seq = ''.join(seq_parts)

    pseudocount = 0.001
    # Alternative CACA matrix (starrmot15), kept for reference:
    # caca = [[49,0,100,0,91,0,100,0],[1,83,0,88,0,49,0,94],[49,1,0,12,0,9,0,6],[0,16,0,0,9,41,0,0,]]
    # NOTE(review): rows presumably correspond to A, C, G, T -- confirm.
    caca = [[0,1,0,1,0,1],[1,0,1,0,1,0],[0,0,0,0,0,0],[0,0,0,0,0,0]]
    me137 = [[0,1,0,1,0,1],[0,0,0,0,0,0],[1,0,1,0,1,0],[0,0,0,0,0,0]]
    gcgc = [[0,0,0,0,0,0],[0,1,0,1,0,1],[1,0,1,0,1,0],[0,0,0,0,0,0]]
    tata = [[0,1,0,1,0,1],[0,0,0,0,0,0],[0,0,0,0,0,0],[1,0,1,0,1,0]]
    matrices = [me137, caca, gcgc, tata]

    if c == "default":
        results = MOODS.search(seq, matrices, p_val, both_strands=True)

    elif c == "human" or c == "equal":
        if c == "human":
            bg = [0.29508855202553025, 0.20466109233964447,
                  0.20478482916547036, 0.2954655264693549]
        else:
            bg = [0.25, 0.25, 0.25, 0.25]
        log_odds = [MOODS.count_log_odds(matrix, bg, pseudocount)
                    for matrix in matrices]
        # Thresholds are derived from the log-odds matrices, not the counts.
        thresholds = [MOODS.threshold_from_p(matrix, bg, p_val)
                      for matrix in log_odds]
        results = MOODS.search(seq, log_odds, thresholds,
                               convert_log_odds=False,
                               threshold_from_p=False, both_strands=True)

    else:
        # Regex matches are plain strings, not (position, score) pairs, so
        # the match-writing code below must not run for them.
        write_match = False
        repeat_patterns = [
            (r'GAGA(?:GA)+', r'TCTC(?:TC)+'),
            (r'CACA(?:CA)+', r'TGTG(?:TG)+'),
            (r'GCGC(?:GC)+', r'CGCG(?:CG)+'),
            (r'TATA(?:TA)+', r'ATAT(?:AT)+'),
        ]
        results = [
            re.findall(fwd, seq, re.IGNORECASE)
            + re.findall(rev, seq, re.IGNORECASE)
            for fwd, rev in repeat_patterns
        ]

    # write files containing the matches for each motif as found by MOODS
    # only if boolean set at top of count() is True
    if write_match:
        stamp = time.strftime("%Y-%m-%d")
        outputs = [
            ('%sga_matches_%d.txt', me137),
            ('%sca_matches_%d.txt', caca),
            ('%sgc_matches_%d.txt', gcgc),
            ('%sta_matches_%d.txt', tata),
        ]
        for (name_format, motif), matches in zip(outputs, results):
            width = len(motif[1])
            with open(name_format % (stamp, 1/p_val), 'a') as out:
                for match in matches:
                    pos = match[0]
                    # Negative positions mark hits on the other strand and
                    # are offset by the sequence length.
                    on_complement = pos < 0
                    if on_complement:
                        pos += len(seq)
                    st = seq[pos:pos + width]
                    if on_complement:
                        st = complement(st)
                    out.write('%s\n' % st)

    results.append(len(seq))
    return results
Example #26
0
def find_motif_disruptions(
    position,
    ref,
    alt,
    genome_fasta,
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    The reference sequence around the variant is fetched with
    ``samtools faidx``, the alternate sequence is built by substituting
    ``alt`` for ``ref``, both are scanned with MOODS on both strands, and
    the best match overlapping the variant is compared per motif.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.

    ref : str
        Reference allele. This should match the reference sequence at "position"
        in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data frames
        or numpy arrays containing PWMs with columns ACGT.

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame with motifs whose best matches that overlapped the
        variant differed between the reference and alternate sequences. A score
        of zero and a strand of '' indicates that there was not a match for the
        motif on the given allele.

    Raises
    ------
    subprocess.CalledProcessError
        If samtools exits with a non-zero status.

    """
    import subprocess
    import MOODS

    motif_names = list(matrices.keys())
    max_motif_length = max(matrices[name].shape[0] for name in motif_names)
    # Number of flanking bases fetched on each side of the variant so that
    # every motif placement overlapping the variant fits in the sequence.
    flank = max_motif_length - 1

    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]
    # NOTE(review): samtools region strings are 1-based and inclusive, while
    # the docstring describes zero-based coordinates. The slicing below
    # assumes the ref allele begins at offset `flank` of the fetched sequence
    # -- confirm the coordinate convention callers actually use.
    region = '{}:{}-{}'.format(chrom, start - flank, end + flank)
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters are handled safely; universal_newlines=True yields text
    # rather than bytes on Python 3.
    faidx_out = subprocess.check_output(
        ['samtools', 'faidx', genome_fasta, region],
        universal_newlines=True)
    seq_lines = faidx_out.strip().split()
    # seq_lines[0] is the FASTA header; samtools wraps long sequences over
    # several lines, so all remaining lines must be concatenated.
    ref_seq = ''.join(seq_lines[1:])
    alt_seq = ref_seq[:flank] + alt + ref_seq[flank + len(ref):]

    ref_variant_start = flank
    ref_variant_end = flank + len(ref)
    alt_variant_start = flank
    alt_variant_end = flank + len(alt)

    # MOODS wants plain nested lists with one row per base. `.T` works for
    # both data frames and numpy arrays; only data frames expose `.values`.
    ms = []
    for name in motif_names:
        transposed = matrices[name].T
        ms.append(getattr(transposed, 'values', transposed).tolist())

    flat_bg = [0.25, 0.25, 0.25, 0.25]
    ref_res = dict(
        zip(motif_names,
            MOODS.search(ref_seq, ms, 0.001, both_strands=True, bg=flat_bg)))
    alt_res = dict(
        zip(motif_names,
            MOODS.search(alt_seq, ms, 0.001, both_strands=True, bg=flat_bg)))

    def _best(matches):
        """Return (score, strand) of the top-scoring match, or (0, '')."""
        if len(matches) == 0:
            return 0, ''
        pos, score = max(matches, key=lambda m: m[1])
        # Reverse-strand hits are reported with negative positions, so
        # position 0 belongs to the forward strand.
        return score, '+' if pos >= 0 else '-'

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in motif_names:
        motif_length = matrices[motif].shape[0]
        ref_res[motif] = _filter_variant_motif_res(ref_res[motif],
                                                   ref_variant_start,
                                                   ref_variant_end,
                                                   motif_length,
                                                   ref_seq)
        alt_res[motif] = _filter_variant_motif_res(alt_res[motif],
                                                   alt_variant_start,
                                                   alt_variant_end,
                                                   motif_length,
                                                   alt_seq)

        ref_score, ref_strand = _best(ref_res[motif])
        alt_score, alt_strand = _best(alt_res[motif])
        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append(
                [motif, ref_score, ref_strand, alt_score, alt_strand, diff])

    out = pd.DataFrame(rows,
                       columns=[
                           'motif', 'ref_score', 'ref_strand', 'alt_score',
                           'alt_strand', 'score_diff'
                       ])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    # Keep only motifs whose best score actually changed.
    out = out[out.score_diff != 0]
    return out
Example #27
0
    def setUp(self):
        """Build the motif fixtures, search objects and expected hits."""

        # (name, count matrix) pairs, entered by hand; four rows per matrix.
        self.motif_matrices = [
            ('zfp4_yrk_3p', [[250, 130, 0, 0, 20, 0, 0, 40, 50],
                             [100, 50, 0, 0, 0, 0, 10, 0, 0],
                             [10, 200, 380, 380, 0, 380, 150, 340, 0],
                             [20, 0, 0, 0, 360, 0, 220, 0, 330]]),
            ('ttgR', [[7.5, 44.5, 0.0, 0.0, 2.5, 47.5, 8.5],
                      [21.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                      [3.5000000000000004, 0.0, 0.0, 50.0, 0.0, 0.0, 0.0],
                      [18.0, 5.5, 50.0, 0.0, 47.5, 2.5, 43.0]]),
            ('cdaR', [[35, 4, 0, 3, 28, 7, 42, 49, 31, 23],
                      [1, 7, 0, 57, 20, 46, 5, 1, 6, 5],
                      [2, 0, 59, 0, 0, 0, 5, 1, 2, 25],
                      [22, 49, 1, 0, 12, 7, 8, 9, 21, 7]]),
            ('p22_cI', [[80, 113, 0, 0, 0, 468, 581, 396, 35],
                        [68, 20, 581, 0, 0, 35, 0, 10, 0],
                        [160, 0, 0, 0, 0, 0, 0, 10, 0],
                        [273, 448, 0, 581, 581, 78, 0, 165, 546]]),
            ('rpol_10', [[23, 373, 105, 210, 210, 0],
                         [43, 0, 66, 51, 97, 19],
                         [19, 3, 51, 55, 37, 11],
                         [316, 25, 179, 85, 57, 371]]),
            ('zfp7_ZP10165', [[50, 0, 0, 50, 0, 0, 0, 0, 50, 0, 50],
                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                              [0, 0, 0, 0, 0, 50, 50, 50, 0, 50, 0],
                              [0, 50, 50, 0, 50, 0, 0, 0, 0, 0, 0]]),
            ('zfp1_efnba2_3p', [[3, 6, 0, 0, 6, 112, 0, 0, 6],
                                [15, 112, 0, 0, 112, 0, 6, 118, 0],
                                [100, 0, 118, 118, 0, 0, 112, 0, 112],
                                [0, 0, 0, 0, 0, 6, 0, 0, 0]]),
            ('lacI', [[30, 30, 0, 65, 200, 10, 100, 100, 60, 130],
                      [30, 100, 0, 165, 30, 230, 30, 30, 30, 30],
                      [200, 100, 0, 10, 10, 10, 30, 30, 40, 40],
                      [0, 30, 260, 20, 20, 10, 100, 100, 130, 60]]),
            ('tetR', [[120, 40, 300, 100, 220, 100, 190, 50],
                      [200, 0, 20, 45, 100, 20, 50, 70],
                      [20, 300, 50, 20, 20, 200, 100, 150],
                      [50, 50, 20, 225, 50, 70, 50, 120]]),
            ('933W_cI', [[985, 548, 1544, 0, 0, 1252, 0, 149],
                         [0, 801, 0, 0, 0, 20, 1680, 0],
                         [11, 33, 184, 1728, 0, 0, 0, 1393],
                         [732, 346, 0, 0, 1728, 456, 48, 186]]),
            ('zfp6_ZP10363', [[0, 0, 0, 50, 0, 50, 0, 0, 0, 50, 0],
                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                              [0, 50, 0, 0, 50, 0, 0, 50, 50, 0, 50],
                              [50, 0, 50, 0, 0, 0, 50, 0, 0, 0, 0]]),
            ('zfp5_ZN0024', [[50, 0, 50, 0, 0, 0, 50, 0, 0, 50, 0],
                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
                             [0, 50, 0, 50, 0, 50, 0, 50, 50, 0, 0],
                             [0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0]]),
            ('434_cI', [[621, 80, 186, 0, 0, 0, 0],
                        [0, 76, 68, 30, 0, 0, 0],
                        [0, 375, 0, 0, 0, 659, 0],
                        [38, 128, 405, 629, 659, 0, 659]]),
            ('zfp2_dab2_3p', [[8, 178, 108, 0, 70, 158, 0, 150, 0],
                              [0, 0, 0, 0, 0, 0, 0, 20, 8],
                              [170, 0, 50, 178, 108, 20, 178, 0, 170],
                              [0, 0, 20, 0, 0, 0, 0, 8, 0]]),
            ('acuR', [[0.0, 0.0, 0.0, 0.0, 25.0, 21.0, 17.5, 34.5],
                      [0.0, 0.0, 50.0, 2.0, 7.5, 0.0, 0.0, 0.0],
                      [50.0, 0.0, 0.0, 0.0, 0.0, 11.5, 0.0, 2.0],
                      [0.0, 50.0, 0.0, 48.0, 17.5, 17.5, 32.5, 13.5]]),
            ('rpol_35', [[42, 18, 18, 173, 142, 134, 116],
                         [0, 0, 6, 135, 140, 50, 90],
                         [0, 72, 244, 0, 40, 72, 82],
                         [359, 311, 133, 93, 79, 145, 113]]),
            ('zfp8_ZP10457', [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
                              [50, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0],
                              [0, 0, 0, 0, 50, 0, 0, 50, 50, 0, 0],
                              [0, 50, 50, 50, 0, 50, 50, 0, 0, 0, 0]]),
            ('lambda_cI', [[50, 170, 45, 85, 30, 110, 0, 300],
                           [310, 50, 0, 0, 0, 0, 0, 0],
                           [50, 190, 365, 5, 380, 195, 0, 65],
                           [0, 0, 0, 320, 0, 105, 410, 45]]),
            ('rpol_10_ext', [[0, 0, 100, 23, 373, 105, 210, 210, 0],
                             [60, 0, 100, 43, 0, 66, 51, 97, 19],
                             [140, 240, 100, 19, 3, 51, 55, 37, 11],
                             [200, 160, 100, 316, 25, 179, 85, 57, 371]]),
        ]

        # One hand-entered threshold per motif, in the same order as above.
        self.thresholds = [
            1.0699545943859903, 0.7897693125951122, 0.6726083744651952,
            1.2680761835227123, 1.2225352937342997, 0.9676501362039236,
            0.87457893821108, 0.8775565485963774, 0.7705603886602805,
            1.4143797568075485, 0.9676501362039236, 0.9676501362039236,
            1.2748406244294834, 1.024286840459233, 0.6836163375410198,
            1.4530032582649177, 0.9676501362039236, 1.1239104755269196,
            2.6755385519992174,
        ]

        # Two search objects over the same motifs; they differ only in the
        # last positional flag (False for the "fwd" object, True for "both").
        fwd_matrices = [matrix for _name, matrix in self.motif_matrices]
        self.moodssearch_obj_fwd = MOODS.MOODSSearch(
            fwd_matrices, self.thresholds, MOODS.flatbg(), 7, True, False)

        both_matrices = [matrix for _name, matrix in self.motif_matrices]
        self.moodssearch_obj_both = MOODS.MOODSSearch(
            both_matrices, self.thresholds, MOODS.flatbg(), 7, True, True)

        # apFAB46 sequence
        self.apFAB46_seq = (
            'AAAAAGACAATGAAAAGCTTAGTCATGGCGCGCCAAAAAGAGTATTG'
            'ACTTCGCATCTTTTTGTACCTATAATAGATTCATTGCTA'
        )

        # Expected hits per motif as (position, score) pairs.
        self.apFAB46_hits = {
            '434_cI': [(57, 4.688399966083337), (-6, 3.3910215941638624)],
            '933W_cI': [(-61, 4.046026313566105)],
            'acuR': [(54, 2.0456700959741143), (-69, 3.325251483672279),
                     (-34, 2.045670095974115), (-0.0, 7.348975004033191)],
            'cdaR': [(29, 3.0419647953759767), (-16, 0.7214691456808224)],
            'lacI': [(-57, 1.8194505679247397), (-40, 1.288724504297107)],
            'lambda_cI': [(39, 1.9439365463638196)],
            'p22_cI': [],
            'rpol_10': [(5, 2.0391667799057487), (67, 5.8335347738270835),
                        (72, 3.2805992559471626), (-68, 4.930825591137652),
                        (-54, 1.3544349004462848), (-40, 2.4522945494642174),
                        (-38, 3.1644672601697525), (-15, 3.193650492100148),
                        (-9, 1.7285308122946124)],
            'rpol_10_ext': [(-40, 3.263862791624584)],
            'rpol_35': [(9, 2.494746699845068), (44, 4.673608724636109),
                        (57, 2.1790487168281887), (58, 2.226020656779173),
                        (60, 2.779856657750668), (61, 1.54573352717996),
                        (75, 1.6937781338634328), (-79, 1.524973502802859),
                        (-74, 2.3203399149336397), (-32, 1.8592143421105267),
                        (-19, 2.55933983358448), (-9, 3.369439254870037),
                        (-8, 3.834192305019384), (-3, 4.054157145090022),
                        (-2, 1.7887495239839581)],
            'tetR': [(0, 2.35082455257586), (4, 2.301877909267698),
                     (9, 0.853460927069236), (10, 2.155196197307788),
                     (33, 1.1931970179468807), (34, 3.1110648177773217),
                     (66, 1.533984872075769), (69, 2.8253186216029587),
                     (73, 1.9382219304721406), (-75, 1.129366669586711),
                     (-64, 2.4065472341224545), (-55, 1.1931970179468807),
                     (-54, 2.8883371533478757), (-17, 1.282891279146519)],
            'ttgR': [(8, 3.09623693618527), (59, 1.1998476766957555),
                     (-75, 4.694180469193636), (-71, 2.2452537163828197)],
            'zfp1_efnba2_3p': [],
            'zfp2_dab2_3p': [],
            'zfp4_yrk_3p': [],
            'zfp5_ZN0024': [],
            'zfp6_ZP10363': [],
            'zfp7_ZP10165': [],
            'zfp8_ZP10457': [],
        }

        # Hits actually produced by the both-strands search object.
        self.raw_hits = self.moodssearch_obj_both.search(self.apFAB46_seq)
Example #28
0
def main():
    """
    Command-line entry point: count motif hits per sequence.

    The parsed arguments select one of two modes:

    * ``--to-norm`` given: the table of a previous run is loaded with
      ``load_to_normalize`` and handed to ``write_normalized_table`` together
      with the ``--norm`` path, then the process exits with status 0.
      ``--seqs``, ``--motifs`` and ``--out`` are not used in this mode.
    * otherwise: every sequence from ``--seqs`` is searched with all motifs
      from ``--motifs`` via ``MOODS.search`` using ``--thresh``; one
      tab-separated row of per-motif hit counts is written to ``--out``.
      When ``--norm`` is also given, the counts are collected and passed to
      ``write_normalized_table`` after the scan.

    Raises:
        SystemExit: with status 0 once the ``--to-norm`` mode has finished.
    """

    desc = """... ask me later! I'm on a deadline! ..."""

    parser = argparse.ArgumentParser(description=desc)

    parser.add_argument('--seqs', type=str,
                        help="""Path to a fasta file containing the 'promoter'
                        regions of a single species you wish to scan with motifs.""")

    parser.add_argument('--species', type=str,
                        help="""A quoted string of the species name: 'Anophles gambiae'.""")

    parser.add_argument('--motifs', type=str,
                        help="""Path to a file containing the motifs you wish to use.  
                        The file must be in JASPAR's 'matrix_only.txt' format.""")

    parser.add_argument('--thresh', type=float, required=False, default=0.001,
                        help="""A p-val cut-off above which hits will be ignored. (default = %(default)s)""")

    parser.add_argument('--out', type=str, required=False, default='compare_motifs.out',
                        help="""Path to outfile. (default = %(default)s)""")

    parser.add_argument('--norm', type=str, required=False, default=False,
                        help="""Optional path to outfile for data normalized by upper quartiles w.r.t. each motif. (default = %(default)s)""")

    parser.add_argument('--to-norm', type=str, required=False, default=False,
                        help="""Optional path to outfile of previous run that needs to be normalized. (default = %(default)s)""")

    args = parser.parse_args()

    # Normalize-only mode: re-use the table of an earlier run and stop.
    if args.to_norm:
        norm_dict, headers = load_to_normalize(args.to_norm)
        write_normalized_table(headers, args.norm, norm_dict)
        # The bare `exit` name is injected by the `site` module for
        # interactive use; raising SystemExit directly does not depend on it.
        raise SystemExit(0)

    # create parsers
    motif_parser = ParseJasparMatrixOnly(args.motifs)
    seqs = ParseFastA(args.seqs)

    # Load all motifs at once; sequences are consumed one at a time below.
    motifs = motif_parser.to_dict()
    # Materialise the matrices once: the set of motifs does not change while
    # scanning, and a real list is passed to MOODS.search on every call.
    matrices = list(motifs.values())

    # set up output and headers
    headers = 'seq_name\tspecies\t%s\n' % ('\t'.join(motifs.keys()))

    # Counts are only collected when a normalized table was requested.
    norm_dict = OrderedDict() if args.norm else None

    # The context manager closes the output file even if a search or a
    # write raises part-way through the scan.
    with open(args.out, 'w') as out_file:
        out_file.write(headers)

        for name, seq in seqs:
            hits = MOODS.search(seq, matrices, args.thresh)
            # one hit list per motif -> one count per motif
            counts = [len(x) for x in hits]
            row_key = '%s\t%s' % (name, args.species)
            out_file.write('%s\t%s\n' % (row_key, '\t'.join([str(x) for x in counts])))
            if args.norm:
                norm_dict[row_key] = np.array(counts)

    if args.norm:
        write_normalized_table(headers, args.norm, norm_dict)
Example #29
0
           [0,0,0,1]]

teststring = 'acgtacgt'
'''

# Read every record of the debug FASTA file; the context manager closes the
# handle even if parsing fails.
with open('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.fasta', "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta"))

# Only the first record is searched.
seq = records[0]
teststring = seq.seq

# Run the same search twice, first on both strands and then on one strand,
# and print every (position, score) hit of each run.
# NOTE(review): `matrix` is not defined in this excerpt; it is presumably set
# earlier in the script -- confirm.
for label, both_strands in (('both strands', True), ('one way', False)):
    print(label)
    results = MOODS.search(teststring, [matrix], thresholds=1, absolute_threshold=False, both_strands=both_strands)

    # `results` holds one list of hits per searched matrix.
    for i in results:
        for (position, score) in i:
            print("Position: " + str(position) + " Score: "+ str(score))
        
        
Example #30
0
import MOODS

import Bio.Seq
import Bio.SeqIO

# Parse all FASTA records; the context manager closes the file even if
# parsing raises.
with open("data/sequence/dnaACGT.txt", "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta"))

# Only the first record is searched.
seq = records[0]

# Two example matrices, four rows each.
matrix1 = [     [0,1,0,0,0,0,0,1,1,0],
                [1,0,0,0,0,0,0,0,0,0],
                [0,0,0,0,0,0,0,0,0,0],
                [0,0,1,1,1,1,1,0,0,1]
            ]
# NOTE(review): the first row below has 6 entries while the other three rows
# have 7, so this matrix is ragged. The intended value cannot be determined
# from this script; confirm and correct the data.
matrix2 = [     [10,0,10,3,5,5],
                [0,5,0,3,5,0,5],
                [0,1,0,3,0,5,0],
                [0,4,0,1,0,0,5]
            ]

# Search the sequence with both matrices at once; 0.011 is passed as the
# threshold argument (presumably a p-value -- confirm against the MOODS API).
results = MOODS.search(seq.seq, [matrix1, matrix2], 0.011)

# `results` is indexed per matrix, in the order the matrices were given.
print("Matrix 1 results: "+ str(len(results[0])))
print("Matrix 2 results: "+ str(len(results[1])))
Example #31
0
import MOODS

import Bio.Seq
import Bio.SeqIO

# Parse all FASTA records; the context manager closes the file even if
# parsing raises.
with open("data/sequence/dnaACGT.txt", "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta"))

# Only the first record is searched.
seq = records[0]

# Two example matrices, four rows each.
matrix1 = [[0, 1, 0, 0, 0, 0, 0, 1, 1, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 1, 1, 1, 0, 0, 1]]
# NOTE(review): the first row below has 6 entries while the other three rows
# have 7, so this matrix is ragged. The intended value cannot be determined
# from this script; confirm and correct the data.
matrix2 = [[10, 0, 10, 3, 5, 5], [0, 5, 0, 3, 5, 0, 5], [0, 1, 0, 3, 0, 5, 0],
           [0, 4, 0, 1, 0, 0, 5]]

# Search the sequence with both matrices at once; 0.011 is passed as the
# threshold argument (presumably a p-value -- confirm against the MOODS API).
results = MOODS.search(seq.seq, [matrix1, matrix2], 0.011)

# `results` is indexed per matrix, in the order the matrices were given.
print("Matrix 1 results: " + str(len(results[0])))
print("Matrix 2 results: " + str(len(results[1])))