def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: thisseqname = fastaf.split('/')[-1].split('.')[0] handle = open(fastaf, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() thisseq = records[0].seq print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] print ' Doing pwm ', thispwmname thispwm = MOODS.load_matrix(pwmf) thispwmcomplement = MOODS.reverse_complement(thispwm) print ' strand 1' onestrandindexvector=getMOODSscore(thisseq, thispwm) print ' strand 2' otherstrandindexvecor=getMOODSscore(thisseq, thispwmcomplement) print ' finding best score per bp' bothstrandsindexvector = np.append( onestrandindexvector, otherstrandindexvecor, axis=0) bestscorevector = getMaxPWMScore( bothstrandsindexvector, len(thispwm[0])) for strandnbr in range(len(bothstrandsindexvector)): print ' writing wiggle for strand', str(strandnbr) vegardswritewiggle(bothstrandsindexvector[strandnbr,:], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname+ '/strand_'+str(strandnbr)) print ' writing wiggle for bestscore' vegardswritewiggle(bestscorevector, name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
def getMOODSscore(seqfile, pwmfiles, both_strands=False): handle = open(seqfile, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() seq = records[0].seq print 'len(seq)=', len(seq) matrixlist = list() for f in pwmfiles: matrix = MOODS.load_matrix(f) print 'pwm ', f, 'windowlength=', len(matrix[0]) matrixlist.append(matrix) if both_strands: matrixlist.append( MOODS.reverse_complement(matrix) ) # both_strand option in MOODS returned a akward result. print 'starting MOODS.search', datetime.now() results = MOODS.search(seq, matrixlist, thresholds=1, absolute_threshold=False) print 'done MOODS.search', datetime.now() reslist = [] for n in range(len(pwmfiles)): thisind = n * (1 + both_strands) reslist.append(vegardparseMOODSres(results[thisind], len(seq))) if both_strands: reslist[n] = np.append(reslist[n], vegardparseMOODSres(results[thisind + 1], len(seq)), axis=0) return (reslist)
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: ### seqence only needed for length here. MOODS does this parsing again later but without reporting length. thisseqname = fastaf.split('/')[-1].split('.')[0] handle = open(fastaf, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() thisseq = records[0].seq print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] thispwm = MOODS.load_matrix(pwmf) print ' Doing MOODS both strands for pwm ', thispwmname, ', length=', len( thispwm[0]), datetime.now() onestrandsindexvector = getMOODSscore(fastaf, pwmf, len(thisseq)) print ' bp with no score (given ', NO_SCORE_VALUE, ') is ', ( onestrandsindexvector == NO_SCORE_VALUE).sum(), ' expected ', ( len(thispwm[0]) - 1) print ' finding best score per bp, ', datetime.now() bestscorevector = getMaxPWMScore(onestrandsindexvector, len(thispwm[0])), print ' writing wiggle for score per start index.', datetime.now() vegardswritewiggle(onestrandsindexvector[0, ], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/' + thispwmname) print ' writing wiggle for bestscore. ', datetime.now() vegardswritewiggle(bestscorevector[0], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/' + thispwmname)
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: ### seqence only needed for length here. MOODS does this parsing again later but without reporting length. thisseqname = fastaf.split('/')[-1].split('.')[0] handle = open(fastaf, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() thisseq = records[0].seq print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] thispwm = MOODS.load_matrix(pwmf) print ' Doing MOODS both strands for pwm ', thispwmname, ', length=', len(thispwm[0]), datetime.now() onestrandsindexvector = getMOODSscore(fastaf, pwmf, len(thisseq)) print ' bp with no score (given ', NO_SCORE_VALUE, ') is ', (onestrandsindexvector == NO_SCORE_VALUE).sum(), ' expected ', (len(thispwm[0])-1) print ' finding best score per bp, ', datetime.now() bestscorevector = getMaxPWMScore( onestrandsindexvector, len(thispwm[0])), print ' writing wiggle for score per start index.', datetime.now() vegardswritewiggle(onestrandsindexvector[0,], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname) print ' writing wiggle for bestscore. ', datetime.now() vegardswritewiggle(bestscorevector[0], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
def getMOODSscore(seqfile, pwmfiles, both_strands=False): handle = open(seqfile, "r") records = list(Bio.SeqIO.parse(handle, "fasta")) handle.close() seq = records[0].seq print 'len(seq)=',len(seq) matrixlist=list() for f in pwmfiles: matrix = MOODS.load_matrix(f) print 'pwm ', f , 'windowlength=', len(matrix[0]) matrixlist.append(matrix) if both_strands: matrixlist.append(MOODS.reverse_complement(matrix)) # both_strand option in MOODS returned a akward result. print 'starting MOODS.search', datetime.now() results = MOODS.search(seq, matrixlist, thresholds=1, absolute_threshold=False) print 'done MOODS.search', datetime.now() reslist=[] for n in range(len(pwmfiles)): thisind = n * (1 + both_strands) reslist.append(vegardparseMOODSres( results[thisind] , len(seq))) if both_strands: reslist[n] = np.append( reslist[n] , vegardparseMOODSres( results[thisind+1] , len(seq)), axis=0) return(reslist)
#datetime.now() print 'running getMOODSscore', datetime.now() indexscorematrix = getMOODSscore(fastafile, pwmfiles, both_strands=calculate_both_strands) print 'finished getMOODSscore', datetime.now() #datetime.now() ## for alle pwm. ## lage max array ## skrive ut 3 filer. for n in range(len(pwmfiles)): thisname = pwmfiles[n].split('/')[-1] print 'making maxscpre for ', thisname, datetime.now() thisscorematrix = indexscorematrix[n] # thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0]) thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength) print 'writing wiggle for ', thisname, datetime.now() ### best score file vegardswritewiggle(thismaxvector, name=thisname, chr=seqname, path=outputdir + '/' + 'best_score_in_window/' + thisname) for strandnbr in range(len(thisscorematrix)): vegardswritewiggle(thisscorematrix[strandnbr, :], name=thisname, chr=seqname, path=outputdir + '/' + 'start_index_score/' + thisname + '/strand_' + str(strandnbr))
def ProcessCLI(args): outputDirectory = '/N/u/jubudka/Mason/BindingFiles/' weightMatrixDirectory = '/N/u/jubudka/Mason/PWMsmall/' sequencesFileName = 'FASTA_All_Merged_Encode.fasta' p_val = 0.0001 print args for i in xrange(len(args)): if args[i] == "-f": sequencesFileName = args[i+1] print "Fasta file is: ", sequencesFileName elif args[i] == "-p": weightMatrixDirectory = args[i+1] print "PWM file is: ", weightMatrixDirectory elif args[i] == "-o": outputDirectory = args[i+1] print "Output file is: ", outputDirectory elif args[i] == "-t": p_val = float(args[i+1]) if not os.path.exists(outputDirectory): os.makedirs(outputDirectory) # file for saving average score stuff # load position weight matrices # order is A C G T sequences = {} seqIDs = [] current_sequence = '' sequencesFile = open(sequencesFileName) aCount = 0 cCount = 0 gCount = 0 tCount = 0 totalLength = 0 for lines in sequencesFile: line = lines.strip() if line == '': continue if (line[0].startswith('>')): seqIDs.append(line[1:]) #add previous sequence to dictionary #create the reverse complement and add to dictionary #perform nucleotide counting #reset sequence to '' for next fasta sequence if (len(current_sequence) > 0): upper_current_sequence = current_sequence.upper() seqID = seqIDs.pop(0) sequences[seqID + ' ' + 'p'] = upper_current_sequence reverseSequence = reverse_complement(upper_current_sequence) sequences[seqID + ' ' + 'm'] = reverseSequence aCount = aCount + upper_current_sequence.count('A') cCount = cCount + upper_current_sequence.count('C') gCount = gCount + upper_current_sequence.count('G') tCount = tCount + upper_current_sequence.count('T') totalLength = totalLength + len(current_sequence) current_sequence = '' else: current_sequence += line upper_current_sequence = current_sequence.upper() seqID = seqIDs.pop(0) sequences[seqID + ' ' + 'p'] = upper_current_sequence reverseSequence = reverse_complement(upper_current_sequence) sequences[seqID + ' ' + 'm'] = reverseSequence aCount = aCount + upper_current_sequence.count('A') cCount = cCount + upper_current_sequence.count('C') gCount = gCount + upper_current_sequence.count('G') tCount = tCount + upper_current_sequence.count('T') totalLength = totalLength + len(current_sequence) aContent = aCount/float(totalLength) cContent = cCount/float(totalLength) gContent = gCount/float(totalLength) tContent = tCount/float(totalLength) backgroundScores = {'A':aContent, 'C':cContent, 'G':gContent, 'T':tContent} bg = [backgroundScores['A'], backgroundScores['C'], backgroundScores['G'], backgroundScores['T']] print bg matrix_names = [filename for filename in os.listdir(weightMatrixDirectory) if filename[-4:] == '.pfm'] pseudocount = 1 matrices = [MOODS.load_matrix(weightMatrixDirectory + filename) for filename in matrix_names] matrices = [MOODS.count_log_odds(matrix, bg, pseudocount) for matrix in matrices] thresholds = [MOODS.threshold_from_p(matrix, bg, p_val) for matrix in matrices] for (matrix, matrix_name, threshold) in zip(matrices, matrix_names, thresholds): motifLength = len(matrix[0]) if motifLength >= 18: matrix_mapper_long(matrix, matrix_name, threshold, outputDirectory, sequences) continue else: matrix_mapper(matrix, matrix_name, threshold, outputDirectory, sequences) print "Finished"
####### running MOODS algorithm on sequence with all pwm files. #datetime.now() print 'running getMOODSscore', datetime.now() indexscorematrix = getMOODSscore(fastafile, pwmfiles, both_strands=calculate_both_strands) print 'finished getMOODSscore', datetime.now() #datetime.now() ## for alle pwm. ## lage max array ## skrive ut 3 filer. for n in range(len(pwmfiles)): thisname = pwmfiles[n].split('/')[-1] print 'making maxscpre for ',thisname, datetime.now() thisscorematrix = indexscorematrix[n] # thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0]) thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength) print 'writing wiggle for ',thisname, datetime.now() ### best score file vegardswritewiggle(thismaxvector, name=thisname, chr=seqname, path=outputdir + '/' + 'best_score_in_window/'+thisname) for strandnbr in range(len(thisscorematrix)): vegardswritewiggle(thisscorematrix[strandnbr,:], name=thisname, chr=seqname, path=outputdir + '/' + 'start_index_score/'+ thisname+ '/strand_'+str(strandnbr)) temp1 = getMaxPWMScore(temp1, thispwmlength)