def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: thisseqname = fastaf.split('/')[-1].split('.')[0] handle = open(fastaf, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() thisseq = records[0].seq print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] print ' Doing pwm ', thispwmname thispwm = MOODS.load_matrix(pwmf) thispwmcomplement = MOODS.reverse_complement(thispwm) print ' strand 1' onestrandindexvector=getMOODSscore(thisseq, thispwm) print ' strand 2' otherstrandindexvecor=getMOODSscore(thisseq, thispwmcomplement) print ' finding best score per bp' bothstrandsindexvector = np.append( onestrandindexvector, otherstrandindexvecor, axis=0) bestscorevector = getMaxPWMScore( bothstrandsindexvector, len(thispwm[0])) for strandnbr in range(len(bothstrandsindexvector)): print ' writing wiggle for strand', str(strandnbr) vegardswritewiggle(bothstrandsindexvector[strandnbr,:], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname+ '/strand_'+str(strandnbr)) print ' writing wiggle for bestscore' vegardswritewiggle(bestscorevector, name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
def getMOODSscore(seqfile, pwmfiles, both_strands=False): handle = open(seqfile, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() seq = records[0].seq print 'len(seq)=', len(seq) matrixlist = list() for f in pwmfiles: matrix = MOODS.load_matrix(f) print 'pwm ', f, 'windowlength=', len(matrix[0]) matrixlist.append(matrix) if both_strands: matrixlist.append( MOODS.reverse_complement(matrix) ) # both_strand option in MOODS returned a akward result. print 'starting MOODS.search', datetime.now() results = MOODS.search(seq, matrixlist, thresholds=1, absolute_threshold=False) print 'done MOODS.search', datetime.now() reslist = [] for n in range(len(pwmfiles)): thisind = n * (1 + both_strands) reslist.append(vegardparseMOODSres(results[thisind], len(seq))) if both_strands: reslist[n] = np.append(reslist[n], vegardparseMOODSres(results[thisind + 1], len(seq)), axis=0) return (reslist)
def MOODS_search(seq, motif, thresholds=0):
    """Search ``seq`` with MOODS, mimicking ``Motif.search_pwm()``.

    The motif's log-odds columns are converted to a MOODS matrix, both
    strands are searched with an absolute threshold, and the hits of the
    single matrix are returned ordered by the absolute value of their
    position.

    Raises ``RuntimeError`` when ``USE_MOODS`` is false.
    """
    if not USE_MOODS:
        raise RuntimeError("MOODS could not be imported")

    # One row per motif position, values ordered by sorted dict key.
    position_rows = [[value for _, value in sorted(column.items())]
                     for column in motif.log_odds()]
    matrix_ = MOODS.transpose(position_rows)

    # Note: algorithm = 'lf' fails due to segmentation fault
    results_per_matrix = MOODS.search(
        seq,
        [matrix_],
        thresholds,
        bg=None,
        algorithm="pla",
        q=7,
        absolute_threshold=True,
        both_strands=True,
        combine=True,
    )

    # Only one matrix was searched, so only its hit list is of interest.
    hits = results_per_matrix[0]

    # Format like Motif.search_pwm results: order by |position|.  Whether
    # reverse-strand hits additionally need reversing is an open question
    # carried over from the original code.
    return sorted(hits, key=lambda hit: abs(hit[0]))
def getPWMscores(regions, PWM, fasta, regionsFile):
    """
    Score every basepair of a set of regions with a match to a PWM.

    :param regions: `bedtools.BedTool` with regions to look at.
    :type regions: bedtools.BedTool
    :param PWM: Matrix passed to `MOODS.search` as the only matrix.
    :type PWM:
    :param fasta: Fasta file with genome sequence.
    :type fasta: str
    :param regionsFile: Tab-separated file whose 4th column holds the
        region names used as the index of the result.
    :type regionsFile: str
    :returns: `pandas.DataFrame` with one row per region and columns
        labelled -990..989.
    """
    import MOODS
    import numpy as np
    import pandas as pd

    # Get nucleotides
    seq = regions.sequence(s=True, fi=fasta)
    with open(seq.seqfn) as handle:
        # Every second line (after each header) is a sequence.
        seqs = handle.read().split("\n")[1::2]

    # Match strings with PWM; keep only the score of each (position, score)
    scores = []
    for sequence in seqs:
        result = MOODS.search(sequence, [PWM], 30)
        scores.append(np.array([score for _, score in result[0]]))

    # Read the names with a context manager (the original leaked the handle)
    # and skip blank lines instead of unconditionally dropping the last
    # line, which lost a record when the file had no trailing newline.
    with open(regionsFile) as handle:
        names = [line.split("\t")[3]
                 for line in handle.read().split("\n")
                 if line.strip()]

    # NOTE(review): the fixed column labels imply 1980 scores per region --
    # confirm against the region width used by callers.
    return pd.DataFrame(scores, index=names, columns=range(-990, 990))
def getMOODSscore_old(seq, mat):
    """Return a (1, len(seq)) float32 array of MOODS scores by start position.

    Positions for which ``MOODS.search`` reports no hit keep the filler
    value -100.

    :param seq: sequence to search.
    :param mat: single matrix handed to ``MOODS.search``.
    :return: ``numpy.ndarray`` of shape ``(1, len(seq))`` and dtype float32.
    """
    results = MOODS.search(seq, [mat], thresholds=1, absolute_threshold=False)
    # The capitalised 'Float32' dtype alias is not accepted by current
    # NumPy releases; np.float32 is the same type and works everywhere.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for (position, score) in results[0]:
        resarray[0, position] = score
    return resarray
def getMOODSscore_old(seq, mat):
    """Score ``seq`` against one matrix and index the scores by position.

    :param seq: sequence to search.
    :param mat: single matrix handed to ``MOODS.search``.
    :return: float32 ``numpy.ndarray`` of shape ``(1, len(seq))``; entries
        without a reported hit hold -100.
    """
    results = MOODS.search(seq, [mat], thresholds=1, absolute_threshold=False)
    # np.dtype('Float32') (capitalised alias) is rejected by current NumPy;
    # build the pre-filled array directly with the canonical float32 type.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for (position, score) in results[0]:
        resarray[0, position] = score
    return resarray
def search(consensus_list,TF,master,search_region,p): count = 0 count1 = 0 interaction_only = False try: if sys.argv[3] == "True": interaction_only = True else: interaction_only = False except IndexError: pass duplicate = [] threshold = [] header = '' for i in master: count +=1 threshold += [p] print >> sys.stderr, count #print threshold for region in search_region: if 'strand' in region: continue if '>' in region: header = region else: for i in range(len(master)): tf = TF[i] #print tf,consensus_list[i] tf_length = len(consensus_list[i]) #result = MOODS.search(region,master,threshold,absolute_threshold=threshold) result = MOODS.search(region,master,p) for check in range(len(result)): for j in range(len(result[check])): position = result[check][j][0] tf_length = len(consensus_list[check]) if result[check][j] == []: continue if interaction_only: if [TF[check],header] not in duplicate: duplicate += [[TF[check],header]] print tf.strip(),header.strip()[1:] print '' else: if [TF[check],header,position] not in duplicate: duplicate+= [[TF[check],header,position]] print TF[check].strip(),header.strip()[1:] print 'position:',position align(consensus_list[check],region[position:position+tf_length]) #print consensus_list[i],'Matched the motif in the upstream region:',region[position:position+tf_length] print ''
def ProcessSeqs(SEQ_HANDLE, PWMS, THRESHOLD, WANT_REV=False, bg=None): """Yields matches on sequences in an 'interval' formatted dictionary""" pwm_names = map(lambda x: x[0], PWMS) pwm_mats = map(lambda x: x[1], PWMS) thresh = map(lambda x: MOODS.threshold_from_p(x, bg, THRESHOLD), pwm_mats) for interval in ReadInterval(SEQ_HANDLE): print interval['NAME'] results = MOODS.search(interval['SEQ'].upper(), pwm_mats, thresh, both_strands=WANT_REV, algorithm='lf', absolute_threshold=True, bg=bg) for res, pwm_name, pwm_mat, th in zip(results, pwm_names, pwm_mats, thresh): width = len(pwm_mat[0]) for position, score in res: if score > th: yield { 'NAME': interval['NAME'], 'START': int(interval['START']) + position, 'END': int(interval['START']) + width + position, 'STRAND': interval['STRAND'], 'PWM': pwm_name, 'SCORE': score, 'CHROM': interval['CHROM'], 'SEQ': interval['SEQ'][position:(position + width)].upper() } else: print 'got bad result'
def getMOODSscore(seqfile, pwmfiles, both_strands=False): handle = open(seqfile, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() seq = records[0].seq print 'len(seq)=',len(seq) matrixlist=list() for f in pwmfiles: matrix = MOODS.load_matrix(f) print 'pwm ', f , 'windowlength=', len(matrix[0]) matrixlist.append(matrix) if both_strands: matrixlist.append(MOODS.reverse_complement(matrix)) # both_strand option in MOODS returned a akward result. print 'starting MOODS.search', datetime.now() results = MOODS.search(seq, matrixlist, thresholds=1, absolute_threshold=False) print 'done MOODS.search', datetime.now() reslist=[] for n in range(len(pwmfiles)): thisind = n * (1 + both_strands) reslist.append(vegardparseMOODSres( results[thisind] , len(seq))) if both_strands: reslist[n] = np.append( reslist[n] , vegardparseMOODSres( results[thisind+1] , len(seq)), axis=0) return(reslist)
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: ### seqence only needed for length here. MOODS does this parsing again later but without reporting length. thisseqname = fastaf.split('/')[-1].split('.')[0] handle = open(fastaf, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() thisseq = records[0].seq print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] thispwm = MOODS.load_matrix(pwmf) print ' Doing MOODS both strands for pwm ', thispwmname, ', length=', len(thispwm[0]), datetime.now() onestrandsindexvector = getMOODSscore(fastaf, pwmf, len(thisseq)) print ' bp with no score (given ', NO_SCORE_VALUE, ') is ', (onestrandsindexvector == NO_SCORE_VALUE).sum(), ' expected ', (len(thispwm[0])-1) print ' finding best score per bp, ', datetime.now() bestscorevector = getMaxPWMScore( onestrandsindexvector, len(thispwm[0])), print ' writing wiggle for score per start index.', datetime.now() vegardswritewiggle(onestrandsindexvector[0,], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname) print ' writing wiggle for bestscore. ', datetime.now() vegardswritewiggle(bestscorevector[0], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: ### seqence only needed for length here. MOODS does this parsing again later but without reporting length. thisseqname = fastaf.split('/')[-1].split('.')[0] handle = open(fastaf, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() thisseq = records[0].seq print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] thispwm = MOODS.load_matrix(pwmf) print ' Doing MOODS both strands for pwm ', thispwmname, ', length=', len( thispwm[0]), datetime.now() onestrandsindexvector = getMOODSscore(fastaf, pwmf, len(thisseq)) print ' bp with no score (given ', NO_SCORE_VALUE, ') is ', ( onestrandsindexvector == NO_SCORE_VALUE).sum(), ' expected ', ( len(thispwm[0]) - 1) print ' finding best score per bp, ', datetime.now() bestscorevector = getMaxPWMScore(onestrandsindexvector, len(thispwm[0])), print ' writing wiggle for score per start index.', datetime.now() vegardswritewiggle(onestrandsindexvector[0, ], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/' + thispwmname) print ' writing wiggle for bestscore. ', datetime.now() vegardswritewiggle(bestscorevector[0], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/' + thispwmname)
def match_single(motif, sequence, genomic_region, unique_threshold=None, normalize_bitscore=True, sort=False):
    """
    Performs motif matching given sequence and the motif.pssm passed as parameter.
    The genomic_region is needed to evaluate the correct binding position.

    Keyword arguments:
    motif -- Motif object; this function reads its pssm_list, threshold, max, len,
             is_palindrome and name attributes.
    sequence -- A DNA sequence (string).
    genomic_region -- A GenomicRegion; its chrom and initial attributes locate the hits.
    unique_threshold -- If this argument is provided, the motif search will be made using a
                        threshold of 0 and then accepting only the motif matches with
                        bitscore/motif_length >= unique_threshold.
    normalize_bitscore -- If True, scores are rescaled to integers where the evaluation
                          threshold maps to 0 and the motif maximum to 1000.
    sort -- If True, the resulting set is sorted before being returned.

    Return:
    A GenomicRegionSet named "mpbs" with one GenomicRegion per accepted match.
    """

    # Establishing threshold
    if unique_threshold:
        current_threshold = 0.0
        eval_threshold = unique_threshold
        motif_max = motif.max / motif.len
    else:
        current_threshold = motif.threshold
        eval_threshold = motif.threshold
        motif_max = motif.max

    # Performing motif matching
    try:
        # old MOODS version
        results = MOODS.search(sequence, [motif.pssm_list], current_threshold, absolute_threshold=True,
                               both_strands=True)
    except Exception:
        # Fall back to the newer MOODS API.  ``except Exception`` (rather
        # than a bare ``except``) keeps KeyboardInterrupt/SystemExit intact.
        # TODO: we can expand this to use bg from sequence, for example,
        # or from organism.
        bg = MOODS.tools.flat_bg(4)
        results = MOODS.scan.scan_dna(sequence, [motif.pssm_list], bg, [current_threshold], 7)

    grs = GenomicRegionSet("mpbs")

    for search_result in results:
        for r in search_result:
            # Newer MOODS returns match objects, older MOODS returns
            # (position, score) tuples.
            try:
                position = r.pos
                score = r.score
            except AttributeError:
                (position, score) = r

            # Verifying unique threshold acceptance
            if unique_threshold and score / motif.len < unique_threshold:
                continue

            # If match forward strand
            if position >= 0:
                p1 = genomic_region.initial + position
                strand = "+"
            # If match reverse strand
            elif not motif.is_palindrome:
                p1 = genomic_region.initial - position
                strand = "-"
            else:
                continue

            # Evaluating p2
            p2 = p1 + motif.len

            # Evaluating score (integer between 0 and 1000 -- needed for bigbed transformation)
            if normalize_bitscore:
                # Normalized bitscore = standardize to integer between 0 and 1000
                # NOTE(review): with unique_threshold the thresholds are
                # per-position values while ``score`` is the raw bitscore --
                # confirm this scaling is intended.
                if motif_max > eval_threshold:
                    norm_score = int(((score - eval_threshold) * 1000.0) / (motif_max - eval_threshold))
                else:
                    norm_score = 1000
            else:
                # Keep the original bitscore
                if unique_threshold:
                    norm_score = score / motif.len
                else:
                    norm_score = score

            grs.add(GenomicRegion(genomic_region.chrom, int(p1), int(p2), name=motif.name, orientation=strand,
                                  data=str(norm_score)))

    if sort:
        grs.sort()

    return grs
####### running MOODS algorithm on sequence with all pwm files. #datetime.now() print 'running getMOODSscore', datetime.now() indexscorematrix = getMOODSscore(fastafile, pwmfiles, both_strands=calculate_both_strands) print 'finished getMOODSscore', datetime.now() #datetime.now() ## for alle pwm. ## lage max array ## skrive ut 3 filer. for n in range(len(pwmfiles)): thisname = pwmfiles[n].split('/')[-1] print 'making maxscpre for ',thisname, datetime.now() thisscorematrix = indexscorematrix[n] # thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0]) thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength) print 'writing wiggle for ',thisname, datetime.now() ### best score file vegardswritewiggle(thismaxvector, name=thisname, chr=seqname, path=outputdir + '/' + 'best_score_in_window/'+thisname) for strandnbr in range(len(thisscorematrix)): vegardswritewiggle(thisscorematrix[strandnbr,:], name=thisname, chr=seqname, path=outputdir + '/' + 'start_index_score/'+ thisname+ '/strand_'+str(strandnbr)) temp1 = getMaxPWMScore(temp1, thispwmlength)
#datetime.now() print 'running getMOODSscore', datetime.now() indexscorematrix = getMOODSscore(fastafile, pwmfiles, both_strands=calculate_both_strands) print 'finished getMOODSscore', datetime.now() #datetime.now() ## for alle pwm. ## lage max array ## skrive ut 3 filer. for n in range(len(pwmfiles)): thisname = pwmfiles[n].split('/')[-1] print 'making maxscpre for ', thisname, datetime.now() thisscorematrix = indexscorematrix[n] # thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0]) thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength) print 'writing wiggle for ', thisname, datetime.now() ### best score file vegardswritewiggle(thismaxvector, name=thisname, chr=seqname, path=outputdir + '/' + 'best_score_in_window/' + thisname) for strandnbr in range(len(thisscorematrix)): vegardswritewiggle(thisscorematrix[strandnbr, :], name=thisname, chr=seqname, path=outputdir + '/' + 'start_index_score/' + thisname + '/strand_' + str(strandnbr))
def ProcessCLI(args): outputDirectory = '/N/u/jubudka/Mason/BindingFiles/' weightMatrixDirectory = '/N/u/jubudka/Mason/PWMsmall/' sequencesFileName = 'FASTA_All_Merged_Encode.fasta' p_val = 0.0001 print args for i in xrange(len(args)): if args[i] == "-f": sequencesFileName = args[i+1] print "Fasta file is: ", sequencesFileName elif args[i] == "-p": weightMatrixDirectory = args[i+1] print "PWM file is: ", weightMatrixDirectory elif args[i] == "-o": outputDirectory = args[i+1] print "Output file is: ", outputDirectory elif args[i] == "-t": p_val = float(args[i+1]) if not os.path.exists(outputDirectory): os.makedirs(outputDirectory) # file for saving average score stuff # load position weight matrices # order is A C G T sequences = {} seqIDs = [] current_sequence = '' sequencesFile = open(sequencesFileName) aCount = 0 cCount = 0 gCount = 0 tCount = 0 totalLength = 0 for lines in sequencesFile: line = lines.strip() if line == '': continue if (line[0].startswith('>')): seqIDs.append(line[1:]) #add previous sequence to dictionary #create the reverse complement and add to dictionary #perform nucleotide counting #reset sequence to '' for next fasta sequence if (len(current_sequence) > 0): upper_current_sequence = current_sequence.upper() seqID = seqIDs.pop(0) sequences[seqID + ' ' + 'p'] = upper_current_sequence reverseSequence = reverse_complement(upper_current_sequence) sequences[seqID + ' ' + 'm'] = reverseSequence aCount = aCount + upper_current_sequence.count('A') cCount = cCount + upper_current_sequence.count('C') gCount = gCount + upper_current_sequence.count('G') tCount = tCount + upper_current_sequence.count('T') totalLength = totalLength + len(current_sequence) current_sequence = '' else: current_sequence += line upper_current_sequence = current_sequence.upper() seqID = seqIDs.pop(0) sequences[seqID + ' ' + 'p'] = upper_current_sequence reverseSequence = reverse_complement(upper_current_sequence) sequences[seqID + ' ' + 'm'] = reverseSequence aCount = aCount + 
upper_current_sequence.count('A') cCount = cCount + upper_current_sequence.count('C') gCount = gCount + upper_current_sequence.count('G') tCount = tCount + upper_current_sequence.count('T') totalLength = totalLength + len(current_sequence) aContent = aCount/float(totalLength) cContent = cCount/float(totalLength) gContent = gCount/float(totalLength) tContent = tCount/float(totalLength) backgroundScores = {'A':aContent, 'C':cContent, 'G':gContent, 'T':tContent} bg = [backgroundScores['A'], backgroundScores['C'], backgroundScores['G'], backgroundScores['T']] print bg matrix_names = [filename for filename in os.listdir(weightMatrixDirectory) if filename[-4:] == '.pfm'] pseudocount = 1 matrices = [MOODS.load_matrix(weightMatrixDirectory + filename) for filename in matrix_names] matrices = [MOODS.count_log_odds(matrix, bg, pseudocount) for matrix in matrices] thresholds = [MOODS.threshold_from_p(matrix, bg, p_val) for matrix in matrices] for (matrix, matrix_name, threshold) in zip(matrices, matrix_names, thresholds): motifLength = len(matrix[0]) if motifLength >= 18: matrix_mapper_long(matrix, matrix_name, threshold, outputDirectory, sequences) continue else: matrix_mapper(matrix, matrix_name, threshold, outputDirectory, sequences) print "Finished"
import MOODS

# Small demonstration: search one hand-written 4x3 matrix in a fixed
# sequence and print every reported hit.
matrix = [
    [10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [10, 10, 10],
]
sequence = 'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc'

results = MOODS.search(sequence, [matrix], 30, absolute_threshold=30)

# ``results`` holds one hit list per searched matrix.
for hits in results:
    for (position, score) in hits:
        print("Position: %s Score: %s" % (position, score))
import MOODS

# Demonstration matrix: four rows of three columns each.
matrix = [[10, 0, 0],
          [0, 10, 0],
          [0, 0, 10],
          [10, 10, 10]]

# Search the single matrix with threshold 30 and list all hits.
results = MOODS.search(
    'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc',
    [matrix],
    30,
    absolute_threshold=30)

for matrix_hits in results:
    for position, score in matrix_hits:
        message = "Position: {0} Score: {1}".format(position, score)
        print(message)
def _write_bed_hits(res, names, sizes, fileout, chrom_name):
    """Write every MOODS hit in ``res`` to the BED file of its matrix."""
    for n, r in enumerate(res):
        name = names[n]
        out = fileout[name]
        for position, score in r:
            start, end, strand = strand_adjust(position, sizes[name])
            # Scores are written as int(score * 100) since bedToBigBed only
            # accepts integer values.  The trailing '\n' element is joined
            # with a tab, exactly as before, so the output is unchanged.
            out.write('\t'.join([
                chrom_name, str(start), str(end), name,
                str(int(score * 100)), strand, '\n'
            ]))


def main():
    """Search every FASTA record with the PWMs of a file; write BED files.

    Positional arguments: PWM file, FASTA file.  One ``<name><pwm>.bed``
    file is written per matrix (or only for the matrix given with ``-M``).
    Matrices of size <= 20 are searched with the 'lf' algorithm, longer
    ones with 'supera', always on both strands.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-t', '--thresh', action='store', dest='threshold',
                 default=0.0, help='determines threshold')
    p.add_option('-a', '--append', action='store', dest='name',
                 default='resultsfor',
                 help='appends pwm name to this when creating files')
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific')
    options, args = p.parse_args()

    # Read both inputs and close them right away (the original never closed
    # them).  NOTE(review): assumes PWMparser.parse reads eagerly -- confirm.
    with open(args[0], 'rU') as pwm, open(args[1], 'rU') as fa:
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}
    bgt = False
    if options.stdbg:
        bgt = [0.25, 0.25, 0.25, 0.25]
    threshold = float(options.threshold)

    try:
        # Construct Matrices to search and files to write to.
        for k in index.keys():
            if options.specific and k != options.specific:
                continue
            fileout[k] = open(options.name + k + '.bed', 'w')
            if sizes[k] <= 20:
                underorequal20.append(matricies[k])
                under20names.append(k)
            else:
                over20.append(matricies[k])
                over20names.append(k)

        for chrom in pfa:
            print(chrom.name)
            # Should we sort the results as all downstream applications
            # require a sort first
            # Run under 20s (skipped when no such matrix was selected).
            if underorequal20:
                res = MOODS.search(chrom.seq, underorequal20, threshold,
                                   absolute_threshold=options.A,
                                   both_strands=True, bg=bgt, algorithm='lf')
                _write_bed_hits(res, under20names, sizes, fileout, chrom.name)
            # Run over 20s
            if over20:
                res = MOODS.search(chrom.seq, over20, threshold,
                                   absolute_threshold=options.A,
                                   both_strands=True, bg=bgt,
                                   algorithm='supera')
                _write_bed_hits(res, over20names, sizes, fileout, chrom.name)
    finally:
        # Flush and close every BED file, even after an error.
        for handle in fileout.values():
            handle.close()
# Compare the one-shot MOODS.search call with a reusable MOODSSearch object
# on the first record of the FASTA file.
records = fasta.parseFasta(fasta_filepath)
seq = records[0][1]

matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row has 6 entries while the other rows have 7 --
# confirm the intended values.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]

# One-shot search of both matrices with a single threshold.
results = MOODS.search(seq, [matrix1, matrix2], 0.011)
print("Matrix 1 results: %s" % len(results[0]))
print("Matrix 2 results: %s" % len(results[1]))

# Same search through a MOODSSearch object built from explicit settings.
matrices = [matrix1, matrix2]
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(
    matrices, thresholds, bg, q, absolute_threshold, both_strands)
results = ms.search(seq)
print("New Matrix 1 results: %s" % len(results[0]))
def find_motif_disruptions(
    position,
    ref,
    alt,
    genome_fasta,
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.

    ref : str
        Reference allele. This should match the reference sequence at "position"
        in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data
        frames or numpy arrays containing PWMs with columns ACGT.

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame with motifs whose best matches that overlapped the
        variant differed between the reference and alternate sequences. A score
        of zero and a strand of '' indicates that there was not a match for the
        motif on the given allele.

    """
    import subprocess
    import MOODS
    # import pybedtools as pbt
    max_motif_length = max([x.shape[0] for x in matrices.values()])
    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]
    # NOTE(review): samtools regions are 1-based while `position` is
    # documented as zero-based -- confirm the flank arithmetic below.
    region = '{}:{}-{}'.format(chrom, start - max_motif_length + 1,
                               end + max_motif_length - 1)
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed to samtools verbatim.
    faidx_output = subprocess.check_output(
        ['samtools', 'faidx', genome_fasta, region])
    if not isinstance(faidx_output, str):
        faidx_output = faidx_output.decode()
    seq_lines = faidx_output.strip().split()
    # samtools wraps long sequences over several lines; join everything
    # after the header (the original kept only the first sequence line).
    ref_seq = ''.join(seq_lines[1:])
    alt_seq = (ref_seq[0:max_motif_length - 1] + alt +
               ref_seq[max_motif_length + len(ref) - 1:])

    ref_variant_start = max_motif_length - 1
    ref_variant_end = max_motif_length - 1 + len(ref)
    alt_variant_start = max_motif_length - 1
    alt_variant_end = max_motif_length - 1 + len(alt)

    # Fix the motif order once so matrices and results stay aligned.
    motif_names = list(matrices.keys())
    ms = [matrices[x].T.values.tolist() for x in motif_names]
    ref_res = MOODS.search(ref_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    ref_res = dict(zip(motif_names, ref_res))
    alt_res = MOODS.search(alt_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    alt_res = dict(zip(motif_names, alt_res))

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in ref_res.keys():
        ref_res[motif] = _filter_variant_motif_res(
            ref_res[motif], ref_variant_start, ref_variant_end,
            matrices[motif].shape[0], ref_seq)
        alt_res[motif] = _filter_variant_motif_res(
            alt_res[motif], alt_variant_start, alt_variant_end,
            matrices[motif].shape[0], alt_seq)

        if len(ref_res[motif]) > 0:
            # Best hit = highest score; ties resolve to the earliest hit.
            ref_pos, ref_score = max(ref_res[motif], key=lambda x: x[1])
            ref_strand = '+' if ref_pos > 0 else '-'
        else:
            ref_score = 0
            ref_strand = ''
        if len(alt_res[motif]) > 0:
            alt_pos, alt_score = max(alt_res[motif], key=lambda x: x[1])
            alt_strand = '+' if alt_pos > 0 else '-'
        else:
            alt_score = 0
            alt_strand = ''
        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append([motif, ref_score, ref_strand, alt_score, alt_strand,
                         diff])

    out = pd.DataFrame(rows, columns=['motif', 'ref_score', 'ref_strand',
                                      'alt_score', 'alt_strand', 'score_diff'])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    # Keep only motifs whose best score actually changed.
    out = out[out.score_diff != 0]
    return out
def setUp(self):
    """Build the fixture: motif matrices, thresholds, two MOODSSearch
    objects, the apFAB46 test sequence, its expected hits and the hits
    actually computed by the both-strand search object."""
    # load all the motifs by hand.
    # Each entry is a (name, matrix) pair; every matrix has four rows.
    self.motif_matrices = [
        ('zfp4_yrk_3p', [[250, 130, 0, 0, 20, 0, 0, 40, 50],
                         [100, 50, 0, 0, 0, 0, 10, 0, 0],
                         [10, 200, 380, 380, 0, 380, 150, 340, 0],
                         [20, 0, 0, 0, 360, 0, 220, 0, 330]]),
        ('ttgR', [[7.5, 44.5, 0.0, 0.0, 2.5, 47.5, 8.5],
                  [21.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [3.5000000000000004, 0.0, 0.0, 50.0, 0.0, 0.0, 0.0],
                  [18.0, 5.5, 50.0, 0.0, 47.5, 2.5, 43.0]]),
        ('cdaR', [[35, 4, 0, 3, 28, 7, 42, 49, 31, 23],
                  [1, 7, 0, 57, 20, 46, 5, 1, 6, 5],
                  [2, 0, 59, 0, 0, 0, 5, 1, 2, 25],
                  [22, 49, 1, 0, 12, 7, 8, 9, 21, 7]]),
        ('p22_cI', [[80, 113, 0, 0, 0, 468, 581, 396, 35],
                    [68, 20, 581, 0, 0, 35, 0, 10, 0],
                    [160, 0, 0, 0, 0, 0, 0, 10, 0],
                    [273, 448, 0, 581, 581, 78, 0, 165, 546]]),
        ('rpol_10', [[23, 373, 105, 210, 210, 0],
                     [43, 0, 66, 51, 97, 19],
                     [19, 3, 51, 55, 37, 11],
                     [316, 25, 179, 85, 57, 371]]),
        ('zfp7_ZP10165', [[50, 0, 0, 50, 0, 0, 0, 0, 50, 0, 50],
                          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                          [0, 0, 0, 0, 0, 50, 50, 50, 0, 50, 0],
                          [0, 50, 50, 0, 50, 0, 0, 0, 0, 0, 0]]),
        ('zfp1_efnba2_3p', [[3, 6, 0, 0, 6, 112, 0, 0, 6],
                            [15, 112, 0, 0, 112, 0, 6, 118, 0],
                            [100, 0, 118, 118, 0, 0, 112, 0, 112],
                            [0, 0, 0, 0, 0, 6, 0, 0, 0]]),
        ('lacI', [[30, 30, 0, 65, 200, 10, 100, 100, 60, 130],
                  [30, 100, 0, 165, 30, 230, 30, 30, 30, 30],
                  [200, 100, 0, 10, 10, 10, 30, 30, 40, 40],
                  [0, 30, 260, 20, 20, 10, 100, 100, 130, 60]]),
        ('tetR', [[120, 40, 300, 100, 220, 100, 190, 50],
                  [200, 0, 20, 45, 100, 20, 50, 70],
                  [20, 300, 50, 20, 20, 200, 100, 150],
                  [50, 50, 20, 225, 50, 70, 50, 120]]),
        ('933W_cI', [[985, 548, 1544, 0, 0, 1252, 0, 149],
                     [0, 801, 0, 0, 0, 20, 1680, 0],
                     [11, 33, 184, 1728, 0, 0, 0, 1393],
                     [732, 346, 0, 0, 1728, 456, 48, 186]]),
        ('zfp6_ZP10363', [[0, 0, 0, 50, 0, 50, 0, 0, 0, 50, 0],
                          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                          [0, 50, 0, 0, 50, 0, 0, 50, 50, 0, 50],
                          [50, 0, 50, 0, 0, 0, 50, 0, 0, 0, 0]]),
        ('zfp5_ZN0024', [[50, 0, 50, 0, 0, 0, 50, 0, 0, 50, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
                         [0, 50, 0, 50, 0, 50, 0, 50, 50, 0, 0],
                         [0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0]]),
        ('434_cI', [[621, 80, 186, 0, 0, 0, 0],
                    [0, 76, 68, 30, 0, 0, 0],
                    [0, 375, 0, 0, 0, 659, 0],
                    [38, 128, 405, 629, 659, 0, 659]]),
        ('zfp2_dab2_3p', [[8, 178, 108, 0, 70, 158, 0, 150, 0],
                          [0, 0, 0, 0, 0, 0, 0, 20, 8],
                          [170, 0, 50, 178, 108, 20, 178, 0, 170],
                          [0, 0, 20, 0, 0, 0, 0, 8, 0]]),
        ('acuR', [[0.0, 0.0, 0.0, 0.0, 25.0, 21.0, 17.5, 34.5],
                  [0.0, 0.0, 50.0, 2.0, 7.5, 0.0, 0.0, 0.0],
                  [50.0, 0.0, 0.0, 0.0, 0.0, 11.5, 0.0, 2.0],
                  [0.0, 50.0, 0.0, 48.0, 17.5, 17.5, 32.5, 13.5]]),
        ('rpol_35', [[42, 18, 18, 173, 142, 134, 116],
                     [0, 0, 6, 135, 140, 50, 90],
                     [0, 72, 244, 0, 40, 72, 82],
                     [359, 311, 133, 93, 79, 145, 113]]),
        ('zfp8_ZP10457', [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
                          [50, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0],
                          [0, 0, 0, 0, 50, 0, 0, 50, 50, 0, 0],
                          [0, 50, 50, 50, 0, 50, 50, 0, 0, 0, 0]]),
        ('lambda_cI', [[50, 170, 45, 85, 30, 110, 0, 300],
                       [310, 50, 0, 0, 0, 0, 0, 0],
                       [50, 190, 365, 5, 380, 195, 0, 65],
                       [0, 0, 0, 320, 0, 105, 410, 45]]),
        ('rpol_10_ext', [[0, 0, 100, 23, 373, 105, 210, 210, 0],
                         [60, 0, 100, 43, 0, 66, 51, 97, 19],
                         [140, 240, 100, 19, 3, 51, 55, 37, 11],
                         [200, 160, 100, 316, 25, 179, 85, 57, 371]])
    ]
    # load all thresholds by hand
    # One threshold per entry of self.motif_matrices, in the same order.
    self.thresholds = [
        1.0699545943859903, 0.7897693125951122, 0.6726083744651952,
        1.2680761835227123, 1.2225352937342997, 0.9676501362039236,
        0.87457893821108, 0.8775565485963774, 0.7705603886602805,
        1.4143797568075485, 0.9676501362039236, 0.9676501362039236,
        1.2748406244294834, 1.024286840459233, 0.6836163375410198,
        1.4530032582649177, 0.9676501362039236, 1.1239104755269196,
        2.6755385519992174
    ]
    # generate the moodssearch object.
    # The two objects differ only in the last positional argument
    # (presumably the both_strands flag -- TODO confirm against the
    # MOODSSearch signature).
    self.moodssearch_obj_fwd = MOODS.MOODSSearch(
        [x[1] for x in self.motif_matrices], self.thresholds,
        MOODS.flatbg(), 7, True, False)
    self.moodssearch_obj_both = MOODS.MOODSSearch(
        [x[1] for x in self.motif_matrices], self.thresholds,
        MOODS.flatbg(), 7, True, True)
    #apFAB46 sequence
    self.apFAB46_seq = ('AAAAAGACAATGAAAAGCTTAGTCATGGCGCGCCAAAAAGAGTATTG'
                        'ACTTCGCATCTTTTTGTACCTATAATAGATTCATTGCTA')
    #These are the proper hits.
    # Keyed by motif name; each value is a list of (position, score) pairs.
    self.apFAB46_hits = {
        '434_cI': [(57, 4.688399966083337), (-6, 3.3910215941638624)],
        '933W_cI': [(-61, 4.046026313566105)],
        'acuR': [(54, 2.0456700959741143), (-69, 3.325251483672279),
                 (-34, 2.045670095974115), (-0.0, 7.348975004033191)],
        'cdaR': [(29, 3.0419647953759767), (-16, 0.7214691456808224)],
        'lacI': [(-57, 1.8194505679247397), (-40, 1.288724504297107)],
        'lambda_cI': [(39, 1.9439365463638196)],
        'p22_cI': [],
        'rpol_10': [(5, 2.0391667799057487), (67, 5.8335347738270835),
                    (72, 3.2805992559471626), (-68, 4.930825591137652),
                    (-54, 1.3544349004462848), (-40, 2.4522945494642174),
                    (-38, 3.1644672601697525), (-15, 3.193650492100148),
                    (-9, 1.7285308122946124)],
        'rpol_10_ext': [(-40, 3.263862791624584)],
        'rpol_35': [(9, 2.494746699845068), (44, 4.673608724636109),
                    (57, 2.1790487168281887), (58, 2.226020656779173),
                    (60, 2.779856657750668), (61, 1.54573352717996),
                    (75, 1.6937781338634328), (-79, 1.524973502802859),
                    (-74, 2.3203399149336397), (-32, 1.8592143421105267),
                    (-19, 2.55933983358448), (-9, 3.369439254870037),
                    (-8, 3.834192305019384), (-3, 4.054157145090022),
                    (-2, 1.7887495239839581)],
        'tetR': [(0, 2.35082455257586), (4, 2.301877909267698),
                 (9, 0.853460927069236), (10, 2.155196197307788),
                 (33, 1.1931970179468807), (34, 3.1110648177773217),
                 (66, 1.533984872075769), (69, 2.8253186216029587),
                 (73, 1.9382219304721406), (-75, 1.129366669586711),
                 (-64, 2.4065472341224545), (-55, 1.1931970179468807),
                 (-54, 2.8883371533478757), (-17, 1.282891279146519)],
        'ttgR': [(8, 3.09623693618527), (59, 1.1998476766957555),
                 (-75, 4.694180469193636), (-71, 2.2452537163828197)],
        'zfp1_efnba2_3p': [],
        'zfp2_dab2_3p': [],
        'zfp4_yrk_3p': [],
        'zfp5_ZN0024': [],
        'zfp6_ZP10363': [],
        'zfp7_ZP10165': [],
        'zfp8_ZP10457': []
    }
    # hits computed by the moods search object
    self.raw_hits = self.moodssearch_obj_both.search(self.apFAB46_seq)
def main():
    """Search every FASTA record with every PWM and write one BED file per motif.

    Command line: ``[options] PWM_FILE FASTA_FILE``.

    Matrices whose size is <= 20 are searched with the MOODS algorithm
    ``'lf'``, larger ones with ``'supera'``.  Hits for motif ``k`` are written
    to the file ``<--append value><k>.bed``.  With ``-M NAME`` only the motif
    called ``NAME`` is searched.

    Exits through ``OptionParser.error`` (status 2) when the two positional
    arguments are missing.
    """
    import sys

    p = optparse.OptionParser(__doc__)
    p.add_option('-t', '--thresh', action='store', dest='threshold',
                 default=0.0, help='determines threshold')
    p.add_option('-a', '--append', action='store', dest='name',
                 default='resultsfor',
                 help='appends pwm name to this when creating files')
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific')
    options, args = p.parse_args()
    if len(args) < 2:
        p.error('expected two arguments: PWM_FILE FASTA_FILE')

    # 'U' was removed from open() in Python 3.11; universal newlines are the
    # default for text mode on Python 3, so the flag is only needed on 2.x.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(args[0], read_mode) as pwm, open(args[1], read_mode) as fa:
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}
    bgt = False
    if options.stdbg:
        bgt = [0.25, 0.25, 0.25, 0.25]

    try:
        # Construct Matrices to search and files to write to.
        for k in index.keys():
            if options.specific and k != options.specific:
                continue
            fileout[k] = open(options.name + k + '.bed', 'w')
            if sizes[k] <= 20:
                underorequal20.append(matricies[k])
                under20names.append(k)
            else:
                over20.append(matricies[k])
                over20names.append(k)

        # (matrices, motif names, MOODS algorithm) for each size class.
        searches = (
            (underorequal20, under20names, 'lf'),
            (over20, over20names, 'supera'),
        )
        for chrom in pfa:
            print(chrom.name)
            for group, names, algorithm in searches:
                if not group:
                    # Nothing selected in this size class: skip the scan.
                    continue
                # Should we sort the results as all downstream applications
                # require a sort first
                res = MOODS.search(chrom.seq, group, float(options.threshold),
                                   absolute_threshold=options.A,
                                   both_strands=True, bg=bgt,
                                   algorithm=algorithm)
                for n, r in enumerate(res):
                    motif = names[n]
                    out = fileout[motif]
                    for position, score in r:
                        start, end, strand = strand_adjust(position,
                                                           sizes[motif])
                        # Add option to round the score values. Defaulting to
                        # int atm since bedToBigBed only accepts integer
                        # values....
                        # NOTE: '\n' is an element of the joined list, so every
                        # line ends with a tab before the newline; kept as-is
                        # so the output stays byte-identical.
                        out.write('\t'.join([chrom.name, str(start), str(end),
                                             motif, str(int(score * 100)),
                                             strand, '\n']))
    finally:
        # Close (and thereby flush) every BED file, even after a failure.
        for handle in fileout.values():
            handle.close()
import fasta

# Example script: run the same two matrices through MOODS.search and through
# a MOODS.MOODSSearch object, printing the number of hits found.

# DIST_DIR is two directory levels above LOCAL_DIR (defined outside this span).
DIST_DIR = abspath(dirname(dirname(LOCAL_DIR)))
print(DIST_DIR)
fasta_filepath = join(DIST_DIR, "examples/data/sequence/dnaACGT.txt")
records = fasta.parseFasta(fasta_filepath)
# Element 1 of the first record; presumably the sequence string -- confirm
# against fasta.parseFasta.
seq = records[0][1]
matrix1 = [[0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
           [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 1, 1, 1, 1, 1, 0, 0, 1]]
# NOTE(review): the first row has 6 entries while the other three have 7;
# looks like a missing value -- confirm what MOODS does with a ragged matrix.
matrix2 = [[10, 0, 10, 3, 5, 5],
           [0, 5, 0, 3, 5, 0, 5],
           [0, 1, 0, 3, 0, 5, 0],
           [0, 4, 0, 1, 0, 0, 5]]
# One-shot search: the single threshold 0.011 is passed for both matrices.
results = MOODS.search(seq, [matrix1, matrix2], 0.011)
print("Matrix 1 results: " + str(len(results[0])))
print("Matrix 2 results: " + str(len(results[1])))
# Same matrices through a search object, with one threshold per matrix and a
# background derived from the sequence itself.
matrices = [matrix1, matrix2]
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(matrices, thresholds, bg, q, absolute_threshold,
                       both_strands)
results = ms.search(seq)
print("New Matrix 1 results: " + str(len(results[0])))
def _write_matches(path, hits, seq, width):
    """Append the matched subsequence of every hit in *hits* to *path*.

    Each hit is indexed as ``hit[0]`` for its position.  A negative position
    is shifted by ``len(seq)`` and the extracted text is passed through
    ``complement`` before being written, one match per line.
    """
    with open(path, 'a') as out:
        for hit in hits:
            pos = hit[0]
            negative = pos < 0
            if negative:
                pos += len(seq)
            # Indexing (not slicing) so an out-of-range hit still raises.
            site = "".join(seq[pos + i] for i in range(width))
            if negative:
                site = complement(site)
            out.write('%s\n' % site)


def count(enhlist, c, p_val, write_match=False):
    """Find GA/CA/GC/TA dinucleotide-repeat matches in the leading sequence.

    Parameters
    ----------
    enhlist : list of str
        Lines of sequence text.  Lines are consumed up to (not including) the
        first one starting with '>'; newlines are stripped.
    c : str
        Search mode.  "default" runs MOODS.search on the count matrices;
        "human" and "equal" convert the matrices to log-odds against the
        corresponding background and search with thresholds derived from
        ``p_val``; any other value falls back to regular expressions that
        require at least three repeats of the dinucleotide.
    p_val : float
        P-value passed to MOODS (unused by the regex fallback).
    write_match : bool, optional
        When True (MOODS modes only) the matched subsequences are appended to
        dated ``*_matches_*.txt`` files.  Always off in the regex fallback,
        whose hits are plain strings rather than (position, score) pairs.

    Returns
    -------
    list
        ``[ga_hits, ca_hits, gc_hits, ta_hits, len(seq)]``.
    """
    # Collect the pieces and join once instead of re-copying the whole
    # sequence for every input line.
    pieces = []
    for line in enhlist:
        if line.startswith('>'):
            break
        pieces.append(line.replace('\n', ''))
    seq = "".join(pieces)

    pseudocount = 0.001
    # starrmot15
    # caca = [[49,0,100,0,91,0,100,0],[1,83,0,88,0,49,0,94],[49,1,0,12,0,9,0,6],[0,16,0,0,9,41,0,0,]]
    caca = [[0,1,0,1,0,1],[1,0,1,0,1,0],[0,0,0,0,0,0],[0,0,0,0,0,0]]
    me137 = [[0,1,0,1,0,1],[0,0,0,0,0,0],[1,0,1,0,1,0],[0,0,0,0,0,0]]
    gcgc = [[0,0,0,0,0,0],[0,1,0,1,0,1],[1,0,1,0,1,0],[0,0,0,0,0,0]]
    tata = [[0,1,0,1,0,1],[0,0,0,0,0,0],[0,0,0,0,0,0],[1,0,1,0,1,0]]
    # Output-file tag and matrix for each motif, in result order.
    motifs = [('ga', me137), ('ca', caca), ('gc', gcgc), ('ta', tata)]
    matrices = [matrix for _tag, matrix in motifs]

    backgrounds = {
        "human": [0.29508855202553025, 0.20466109233964447,
                  0.20478482916547036, 0.2954655264693549],
        "equal": [0.25, 0.25, 0.25, 0.25],
    }

    if c == "default":
        results = MOODS.search(seq, matrices, p_val, both_strands=True)
    elif c in backgrounds:
        bg = backgrounds[c]
        matrices = [MOODS.count_log_odds(matrix, bg, pseudocount)
                    for matrix in matrices]
        thresholds = [MOODS.threshold_from_p(matrix, bg, p_val)
                      for matrix in matrices]
        results = MOODS.search(seq, matrices, thresholds,
                               convert_log_odds=False,
                               threshold_from_p=False, both_strands=True)
    else:
        write_match = False
        # One pair of patterns per motif: the repeat and its reverse
        # complement, matched case-insensitively.
        pattern_pairs = [
            (r'GAGA(?:GA)+', r'TCTC(?:TC)+'),
            (r'CACA(?:CA)+', r'TGTG(?:TG)+'),
            (r'GCGC(?:GC)+', r'CGCG(?:CG)+'),
            (r'TATA(?:TA)+', r'ATAT(?:AT)+'),
        ]
        results = [
            re.findall(fwd, seq, re.IGNORECASE)
            + re.findall(rev, seq, re.IGNORECASE)
            for fwd, rev in pattern_pairs
        ]

    # write files containing the matches for each motif as found by MOODS
    if write_match:
        for (tag, matrix), hits in zip(motifs, results):
            path = '%s%s_matches_%d.txt' % (time.strftime("%Y-%m-%d"), tag,
                                            1/p_val)
            _write_matches(path, hits, seq, len(matrix[1]))

    results.append(len(seq))
    return results
def find_motif_disruptions(
    position,
    ref,
    alt,
    genome_fasta,
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.

    ref : str
        Reference allele. This should match the reference sequence at
        "position" in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data
        frames or numpy arrays containing PWMs with columns ACGT.

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame with motifs whose best matches that overlapped the
        variant differed between the reference and alternate sequences. A
        score of zero and a strand of '' indicates that there was not a match
        for the motif on the given allele.

    Raises
    ------
    subprocess.CalledProcessError
        If samtools exits with a non-zero status.

    """
    import subprocess

    import MOODS
    import numpy as np

    max_motif_length = max(x.shape[0] for x in matrices.values())
    # Number of flanking bases fetched on each side of the variant.
    flank = max_motif_length - 1
    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]
    # NOTE(review): samtools regions are 1-based inclusive while the docstring
    # describes zero-based input; this window and the slices below look like
    # they may be off by one base -- confirm against a known variant.
    region = '{}:{}-{}'.format(chrom, start - flank, end + flank)
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through untouched.
    # universal_newlines=True returns text rather than bytes on Python 3.
    fasta_text = subprocess.check_output(
        ['samtools', 'faidx', genome_fasta, region], universal_newlines=True)
    seq_lines = fasta_text.strip().split()
    # samtools wraps sequence lines, so join everything after the header
    # instead of keeping only the first sequence line.
    ref_seq = ''.join(seq_lines[1:])
    alt_seq = ref_seq[:flank] + alt + ref_seq[flank + len(ref):]

    ref_variant_start = flank
    ref_variant_end = flank + len(ref)
    alt_variant_start = flank
    alt_variant_end = flank + len(alt)

    names = list(matrices.keys())
    # np.asarray accepts both DataFrames and plain arrays; transposing puts
    # the four nucleotide columns into rows.
    ms = [np.asarray(matrices[x]).T.tolist() for x in names]
    ref_res = MOODS.search(ref_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    ref_res = dict(zip(names, ref_res))
    alt_res = MOODS.search(alt_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    alt_res = dict(zip(names, alt_res))

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in names:
        motif_length = matrices[motif].shape[0]
        ref_hits = _filter_variant_motif_res(
            ref_res[motif], ref_variant_start, ref_variant_end,
            motif_length, ref_seq)
        alt_hits = _filter_variant_motif_res(
            alt_res[motif], alt_variant_start, alt_variant_end,
            motif_length, alt_seq)

        # max() returns the first of several equally scored hits, the same
        # one a stable descending sort would put first.
        # NOTE(review): a hit at position 0 is reported as '-'; confirm that
        # is intended.
        if len(ref_hits) > 0:
            ref_pos, ref_score = max(ref_hits, key=lambda x: x[1])
            ref_strand = '+' if ref_pos > 0 else '-'
        else:
            ref_score = 0
            ref_strand = ''
        if len(alt_hits) > 0:
            alt_pos, alt_score = max(alt_hits, key=lambda x: x[1])
            alt_strand = '+' if alt_pos > 0 else '-'
        else:
            alt_score = 0
            alt_strand = ''

        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append(
                [motif, ref_score, ref_strand, alt_score, alt_strand, diff])

    out = pd.DataFrame(rows, columns=[
        'motif', 'ref_score', 'ref_strand', 'alt_score', 'alt_strand',
        'score_diff'
    ])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    # Keep only motifs whose best score actually changed.
    out = out[out.score_diff != 0]
    return out
def setUp(self):
    """Create the motif matrices, thresholds, search objects and expected hits.

    Everything is stored on ``self``: the hand-entered (name, matrix) pairs,
    one threshold per entry, a forward-only and a both-strands
    ``MOODS.MOODSSearch`` object, the apFAB46 test sequence, the expected
    hits per motif name, and the raw hits of the both-strands search.
    """
    # load all the motifs by hand.
    self.motif_matrices = [
        ('zfp4_yrk_3p', [
            [250, 130, 0, 0, 20, 0, 0, 40, 50],
            [100, 50, 0, 0, 0, 0, 10, 0, 0],
            [10, 200, 380, 380, 0, 380, 150, 340, 0],
            [20, 0, 0, 0, 360, 0, 220, 0, 330],
        ]),
        ('ttgR', [
            [7.5, 44.5, 0.0, 0.0, 2.5, 47.5, 8.5],
            [21.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [3.5000000000000004, 0.0, 0.0, 50.0, 0.0, 0.0, 0.0],
            [18.0, 5.5, 50.0, 0.0, 47.5, 2.5, 43.0],
        ]),
        ('cdaR', [
            [35, 4, 0, 3, 28, 7, 42, 49, 31, 23],
            [1, 7, 0, 57, 20, 46, 5, 1, 6, 5],
            [2, 0, 59, 0, 0, 0, 5, 1, 2, 25],
            [22, 49, 1, 0, 12, 7, 8, 9, 21, 7],
        ]),
        ('p22_cI', [
            [80, 113, 0, 0, 0, 468, 581, 396, 35],
            [68, 20, 581, 0, 0, 35, 0, 10, 0],
            [160, 0, 0, 0, 0, 0, 0, 10, 0],
            [273, 448, 0, 581, 581, 78, 0, 165, 546],
        ]),
        ('rpol_10', [
            [23, 373, 105, 210, 210, 0],
            [43, 0, 66, 51, 97, 19],
            [19, 3, 51, 55, 37, 11],
            [316, 25, 179, 85, 57, 371],
        ]),
        ('zfp7_ZP10165', [
            [50, 0, 0, 50, 0, 0, 0, 0, 50, 0, 50],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 50, 50, 50, 0, 50, 0],
            [0, 50, 50, 0, 50, 0, 0, 0, 0, 0, 0],
        ]),
        ('zfp1_efnba2_3p', [
            [3, 6, 0, 0, 6, 112, 0, 0, 6],
            [15, 112, 0, 0, 112, 0, 6, 118, 0],
            [100, 0, 118, 118, 0, 0, 112, 0, 112],
            [0, 0, 0, 0, 0, 6, 0, 0, 0],
        ]),
        ('lacI', [
            [30, 30, 0, 65, 200, 10, 100, 100, 60, 130],
            [30, 100, 0, 165, 30, 230, 30, 30, 30, 30],
            [200, 100, 0, 10, 10, 10, 30, 30, 40, 40],
            [0, 30, 260, 20, 20, 10, 100, 100, 130, 60],
        ]),
        ('tetR', [
            [120, 40, 300, 100, 220, 100, 190, 50],
            [200, 0, 20, 45, 100, 20, 50, 70],
            [20, 300, 50, 20, 20, 200, 100, 150],
            [50, 50, 20, 225, 50, 70, 50, 120],
        ]),
        ('933W_cI', [
            [985, 548, 1544, 0, 0, 1252, 0, 149],
            [0, 801, 0, 0, 0, 20, 1680, 0],
            [11, 33, 184, 1728, 0, 0, 0, 1393],
            [732, 346, 0, 0, 1728, 456, 48, 186],
        ]),
        ('zfp6_ZP10363', [
            [0, 0, 0, 50, 0, 50, 0, 0, 0, 50, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 50, 0, 0, 50, 0, 0, 50, 50, 0, 50],
            [50, 0, 50, 0, 0, 0, 50, 0, 0, 0, 0],
        ]),
        ('zfp5_ZN0024', [
            [50, 0, 50, 0, 0, 0, 50, 0, 0, 50, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
            [0, 50, 0, 50, 0, 50, 0, 50, 50, 0, 0],
            [0, 0, 0, 0, 50, 0, 0, 0, 0, 0, 0],
        ]),
        ('434_cI', [
            [621, 80, 186, 0, 0, 0, 0],
            [0, 76, 68, 30, 0, 0, 0],
            [0, 375, 0, 0, 0, 659, 0],
            [38, 128, 405, 629, 659, 0, 659],
        ]),
        ('zfp2_dab2_3p', [
            [8, 178, 108, 0, 70, 158, 0, 150, 0],
            [0, 0, 0, 0, 0, 0, 0, 20, 8],
            [170, 0, 50, 178, 108, 20, 178, 0, 170],
            [0, 0, 20, 0, 0, 0, 0, 8, 0],
        ]),
        ('acuR', [
            [0.0, 0.0, 0.0, 0.0, 25.0, 21.0, 17.5, 34.5],
            [0.0, 0.0, 50.0, 2.0, 7.5, 0.0, 0.0, 0.0],
            [50.0, 0.0, 0.0, 0.0, 0.0, 11.5, 0.0, 2.0],
            [0.0, 50.0, 0.0, 48.0, 17.5, 17.5, 32.5, 13.5],
        ]),
        ('rpol_35', [
            [42, 18, 18, 173, 142, 134, 116],
            [0, 0, 6, 135, 140, 50, 90],
            [0, 72, 244, 0, 40, 72, 82],
            [359, 311, 133, 93, 79, 145, 113],
        ]),
        ('zfp8_ZP10457', [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50],
            [50, 0, 0, 0, 0, 0, 0, 0, 0, 50, 0],
            [0, 0, 0, 0, 50, 0, 0, 50, 50, 0, 0],
            [0, 50, 50, 50, 0, 50, 50, 0, 0, 0, 0],
        ]),
        ('lambda_cI', [
            [50, 170, 45, 85, 30, 110, 0, 300],
            [310, 50, 0, 0, 0, 0, 0, 0],
            [50, 190, 365, 5, 380, 195, 0, 65],
            [0, 0, 0, 320, 0, 105, 410, 45],
        ]),
        ('rpol_10_ext', [
            [0, 0, 100, 23, 373, 105, 210, 210, 0],
            [60, 0, 100, 43, 0, 66, 51, 97, 19],
            [140, 240, 100, 19, 3, 51, 55, 37, 11],
            [200, 160, 100, 316, 25, 179, 85, 57, 371],
        ]),
    ]

    # load all thresholds by hand
    # (same number of entries as motif_matrices; presumably one per motif in
    # the same order -- confirm)
    self.thresholds = [
        1.0699545943859903,
        0.7897693125951122,
        0.6726083744651952,
        1.2680761835227123,
        1.2225352937342997,
        0.9676501362039236,
        0.87457893821108,
        0.8775565485963774,
        0.7705603886602805,
        1.4143797568075485,
        0.9676501362039236,
        0.9676501362039236,
        1.2748406244294834,
        1.024286840459233,
        0.6836163375410198,
        1.4530032582649177,
        0.9676501362039236,
        1.1239104755269196,
        2.6755385519992174,
    ]

    # generate the moodssearch object.
    # The two objects differ only in the last positional argument
    # (False for the "fwd" object, True for the "both" object).
    self.moodssearch_obj_fwd = MOODS.MOODSSearch(
        [matrix for _name, matrix in self.motif_matrices],
        self.thresholds,
        MOODS.flatbg(),
        7,
        True,
        False)
    self.moodssearch_obj_both = MOODS.MOODSSearch(
        [matrix for _name, matrix in self.motif_matrices],
        self.thresholds,
        MOODS.flatbg(),
        7,
        True,
        True)

    #apFAB46 sequence
    self.apFAB46_seq = (
        'AAAAAGACAATGAAAAGCTTAGTCATGGCGCGCCAAAAAGAGTATTG'
        'ACTTCGCATCTTTTTGTACCTATAATAGATTCATTGCTA'
    )

    #These are the proper hits.
    self.apFAB46_hits = {
        '434_cI': [
            (57, 4.688399966083337),
            (-6, 3.3910215941638624),
        ],
        '933W_cI': [
            (-61, 4.046026313566105),
        ],
        'acuR': [
            (54, 2.0456700959741143),
            (-69, 3.325251483672279),
            (-34, 2.045670095974115),
            (-0.0, 7.348975004033191),
        ],
        'cdaR': [
            (29, 3.0419647953759767),
            (-16, 0.7214691456808224),
        ],
        'lacI': [
            (-57, 1.8194505679247397),
            (-40, 1.288724504297107),
        ],
        'lambda_cI': [
            (39, 1.9439365463638196),
        ],
        'p22_cI': [],
        'rpol_10': [
            (5, 2.0391667799057487),
            (67, 5.8335347738270835),
            (72, 3.2805992559471626),
            (-68, 4.930825591137652),
            (-54, 1.3544349004462848),
            (-40, 2.4522945494642174),
            (-38, 3.1644672601697525),
            (-15, 3.193650492100148),
            (-9, 1.7285308122946124),
        ],
        'rpol_10_ext': [
            (-40, 3.263862791624584),
        ],
        'rpol_35': [
            (9, 2.494746699845068),
            (44, 4.673608724636109),
            (57, 2.1790487168281887),
            (58, 2.226020656779173),
            (60, 2.779856657750668),
            (61, 1.54573352717996),
            (75, 1.6937781338634328),
            (-79, 1.524973502802859),
            (-74, 2.3203399149336397),
            (-32, 1.8592143421105267),
            (-19, 2.55933983358448),
            (-9, 3.369439254870037),
            (-8, 3.834192305019384),
            (-3, 4.054157145090022),
            (-2, 1.7887495239839581),
        ],
        'tetR': [
            (0, 2.35082455257586),
            (4, 2.301877909267698),
            (9, 0.853460927069236),
            (10, 2.155196197307788),
            (33, 1.1931970179468807),
            (34, 3.1110648177773217),
            (66, 1.533984872075769),
            (69, 2.8253186216029587),
            (73, 1.9382219304721406),
            (-75, 1.129366669586711),
            (-64, 2.4065472341224545),
            (-55, 1.1931970179468807),
            (-54, 2.8883371533478757),
            (-17, 1.282891279146519),
        ],
        'ttgR': [
            (8, 3.09623693618527),
            (59, 1.1998476766957555),
            (-75, 4.694180469193636),
            (-71, 2.2452537163828197),
        ],
        'zfp1_efnba2_3p': [],
        'zfp2_dab2_3p': [],
        'zfp4_yrk_3p': [],
        'zfp5_ZN0024': [],
        'zfp6_ZP10363': [],
        'zfp7_ZP10165': [],
        'zfp8_ZP10457': [],
    }

    # hits computed by the moods search object
    self.raw_hits = self.moodssearch_obj_both.search(self.apFAB46_seq)
def main():
    """
    The main loop.

    Two modes, selected by the command line:

    * scan mode (default): every sequence in ``--seqs`` is searched with every
      motif in ``--motifs``; the number of hits per motif is written as one
      tab-separated row per sequence to ``--out`` and, when ``--norm`` is
      given, a normalized table is written as well.
    * normalize-only mode (``--to-norm``): a previous output table is loaded
      and its normalized form written to ``--norm``; the process then exits
      with status 0.

    Invalid option combinations are reported through
    ``ArgumentParser.error`` (exit status 2).
    """
    desc = """... ask me later! I'm on a deadline! ..."""

    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument(
        '--seqs', type=str,
        help=("Path to a fasta file containing the 'promoter' regions of a "
              "single species you wish to scan with motifs."))
    parser.add_argument(
        '--species', type=str,
        help="A quoted string of the species name: 'Anophles gambiae'.")
    parser.add_argument(
        '--motifs', type=str,
        help=("Path to a file containing the motifs you wish to use. The "
              "file must be in JASPAR's 'matrix_only.txt' format."))
    parser.add_argument(
        '--thresh', type=float, required=False, default=0.001,
        help=("A p-val cut-off above which hits will be ignored. "
              "(default = %(default)s)"))
    parser.add_argument(
        '--out', type=str, required=False, default='compare_motifs.out',
        help="Path to outfile. (default = %(default)s)")
    parser.add_argument(
        '--norm', type=str, required=False, default=False,
        help=("Optional path to outfile for data normalized by upper "
              "quartiles w.r.t. each motif. (default = %(default)s)"))
    parser.add_argument(
        '--to-norm', type=str, required=False, default=False,
        help=("Optional path to outfile of previous run that needs to be "
              "normalized. (default = %(default)s)"))

    args = parser.parse_args()

    if args.to_norm:
        # Normalize-only mode: without --norm there is nowhere to write.
        if not args.norm:
            parser.error('--to-norm requires --norm (path of the normalized '
                         'output)')
        norm_dict, headers = load_to_normalize(args.to_norm)
        write_normalized_table(headers, args.norm, norm_dict)
        # Same effect as the builtin exit(0), without relying on the site
        # module having injected it.
        raise SystemExit(0)

    if not args.seqs or not args.motifs:
        parser.error('--seqs and --motifs are required unless --to-norm is '
                     'given')

    # create parsers
    motifs = ParseJasparMatrixOnly(args.motifs)
    seqs = ParseFastA(args.seqs)

    # Load all motifs at once
    # We will be loading one seq at a time.
    motifs = motifs.to_dict()
    # Materialize once: the same list is reused for every sequence and stays
    # a real list on Python 3, where dict.values() is a view.
    motif_matrices = list(motifs.values())

    # set up output and headers
    headers = 'seq_name\tspecies\t%s\n' % ('\t'.join(motifs.keys()))
    norm_dict = OrderedDict() if args.norm else None

    # lets start the major looping
    with open(args.out, 'w') as out_file:
        out_file.write(headers)
        for name, seq in seqs:
            hits = MOODS.search(seq, motif_matrices, args.thresh)
            counts = [len(x) for x in hits]
            out_file.write('%s\t%s\t%s\n' % (
                name, args.species, '\t'.join([str(x) for x in counts])))
            if args.norm:
                norm_dict['%s\t%s' % (name, args.species)] = np.array(counts)

    if args.norm:
        write_normalized_table(headers, args.norm, norm_dict)
[0,0,0,1]] teststring = 'acgtacgt' ''' handle = open('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.fasta', "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() seq = records[0] teststring=seq.seq print('both strands') results = MOODS.search(teststring, [matrix], thresholds=1, absolute_threshold=False, both_strands=True) for i in results: for (position, score) in i: print("Position: " + str(position) + " Score: "+ str(score)) print('one way') results = MOODS.search(teststring, [matrix], thresholds=1, absolute_threshold=False, both_strands=False) for i in results: for (position, score) in i: print("Position: " + str(position) + " Score: "+ str(score))
import MOODS
import Bio.Seq
import Bio.SeqIO

# Example: count the MOODS hits of two matrices in the first record of the
# sample FASTA file.

# The context manager closes the file even when parsing raises.
with open("data/sequence/dnaACGT.txt", "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta"))

seq = records[0]

matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row has 6 entries while the other three have 7;
# looks like a missing value -- confirm what MOODS does with a ragged matrix.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]

# The single threshold 0.011 is passed for both matrices.
results = MOODS.search(seq.seq, [matrix1, matrix2], 0.011)

print("Matrix 1 results: " + str(len(results[0])))
print("Matrix 2 results: " + str(len(results[1])))
import MOODS
import Bio.Seq
import Bio.SeqIO

# Example: count the MOODS hits of two matrices in the first record of the
# sample FASTA file.

# The context manager closes the file even when parsing raises.
with open("data/sequence/dnaACGT.txt", "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta"))

seq = records[0]

matrix1 = [[0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
           [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           [0, 0, 1, 1, 1, 1, 1, 0, 0, 1]]
# NOTE(review): the first row has 6 entries while the other three have 7;
# looks like a missing value -- confirm what MOODS does with a ragged matrix.
matrix2 = [[10, 0, 10, 3, 5, 5],
           [0, 5, 0, 3, 5, 0, 5],
           [0, 1, 0, 3, 0, 5, 0],
           [0, 4, 0, 1, 0, 0, 5]]

# The single threshold 0.011 is passed for both matrices.
results = MOODS.search(seq.seq, [matrix1, matrix2], 0.011)

print("Matrix 1 results: " + str(len(results[0])))
print("Matrix 2 results: " + str(len(results[1])))