def bitScoreMM(pwmFileName, genomeDict, mpbsDict, scoringMethod, tempLocation,
               pseudocounts=0.1, bitscore=12.0, fpr=0.01, precision=10**4,
               highCutoff=0.7, functionalDepth=0.9):
    """Performs basic motif matching algorithm and writes the results to a
    dictionary indexed by chromosome.

    Every window of the PWM's length is scored on both strands for each
    chromosome returned by constants.getChromList(reference=[mpbsDict]).
    Windows scoring strictly above the cutoff are kept and their scores are
    rescaled to integers relative to the cutoff and the best score found.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary (chromosome name -> sequence string).
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched. One of
                     "bitscore", "fpr" or "boyle".
    tempLocation -- Location to write temporary PWM files in order to help
                    PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- Always. The matches are appended in place to mpbsDict[chromosome]
         as [start, end, pwmName, rescaledScore, strand].

    Exits the interpreter with status 1 if scoringMethod is not recognised.
    """

    # Reading PWM
    pwm = createPwmDict(pwmFileName, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm["A"])
    # Uniform-background term, subtracted from every window score below.
    background = math.log(0.25, 2) * pwmLen

    # Evaluating threshold
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        # NOTE(review): a function named biopythonMM is also defined in this
        # file and calls readPwmFile unqualified; confirm that the name
        # biopythonMM resolves here to an object that provides readPwmFile.
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation, pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        # TODO Boyle's rule is not suited for negative values.
        minScore = 0.0
        maxScore = 0.0
        for i in range(0, pwmLen):
            maxScore += max(pwm["A"][i], pwm["C"][i], pwm["G"][i], pwm["T"][i])
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # A non-zero status lets calling scripts detect the failure.
        sys.exit(1)

    # Creating additional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict((e, []) for e in chrList)
    # Start below any real score so the first hit always becomes the maximum.
    maxValue = float("-inf")
    revDict = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName].upper()
        chrHits = tempMpbsDict[chrName]

        # Performing motif matching (xrange: no index list is materialised
        # for a whole chromosome on Python 2).
        for pos in xrange(0, len(sequence) - pwmLen + 1):
            scoreF = -background
            scoreR = -background
            for i in range(0, pwmLen):
                scoreF += pwm[sequence[pos + i]][i]
                # Reverse strand: read the window backwards, complemented.
                scoreR += pwm[revDict[sequence[pos + pwmLen - i - 1]]][i]
            if scoreF > pwmThreshold:
                if scoreF > maxValue:
                    maxValue = scoreF
                chrHits.append([pos, pos + pwmLen, pwmName, scoreF, "+"])
            if scoreR > pwmThreshold:
                if scoreR > maxValue:
                    maxValue = scoreR
                chrHits.append([pos, pos + pwmLen, pwmName, scoreR, "-"])

    # Update scores - new scores are within [0,1000].
    # Hits are strictly above the threshold, so whenever a hit exists the
    # range below is positive and the division is safe.
    scoreRange = maxValue - pwmThreshold
    for chrName in chrList:
        for e in tempMpbsDict[chrName]:
            mpbsDict[chrName].append([
                e[0], e[1], e[2],
                int(1000 * (e[3] - pwmThreshold) / scoreRange),
                e[4]
            ])

    return 0
def biopythonMM(pwmFileName, genomeDict, mpbsDict, scoringMethod, tempLocation,
                pseudocounts=0.1, bitscore=12.0, fpr=0.01, precision=10**4,
                highCutoff=0.7, functionalDepth=0.9):
    """Performs Biopython based motif matching and writes the results to a
    dictionary indexed by chromosome.

    The PWM object's search_pwm method is run over each chromosome returned
    by constants.getChromList(reference=[mpbsDict]); the scores of the
    reported hits are rescaled to integers relative to the cutoff and the
    best score found.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary (chromosome name -> sequence).
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched. One of
                     "bitscore", "fpr" or "boyle".
    tempLocation -- Location to write temporary PWM files in order to help
                    PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- Always. The matches are appended in place to mpbsDict[chromosome]
         as [start, end, pwmName, rescaledScore, strand].

    Exits the interpreter with status 1 if scoringMethod is not recognised.
    """

    # Reading PWM
    pwm = readPwmFile(pwmFileName, tempLocation, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm)

    # Evaluating threshold
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        sd = Motif.ScoreDistribution(pwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        # TODO Boyle's rule is not suited for negative values.
        maxScore = pwm.max_score()
        minScore = 0.0
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # A non-zero status lets calling scripts detect the failure.
        sys.exit(1)

    # Creating additional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict((e, []) for e in chrList)
    # Start below any real score so the first hit always becomes the maximum.
    maxValue = float("-inf")

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName]
        chrHits = tempMpbsDict[chrName]

        # Performing biopython's motif matching
        for pos, score in pwm.search_pwm(sequence, threshold=pwmThreshold):
            if score > maxValue:
                maxValue = score
            # A negative position is taken to mark a reverse-strand hit.
            # NOTE(review): a hit reported at position 0 is therefore always
            # labelled "+"; confirm how search_pwm reports reverse hits at 0.
            if pos >= 0:
                chrHits.append([pos, pos + pwmLen, pwmName, score, "+"])
            else:
                chrHits.append([-pos, -pos + pwmLen, pwmName, score, "-"])

    # Update scores - new scores are within [0,1000]
    scoreRange = maxValue - pwmThreshold
    for chrName in chrList:
        for e in tempMpbsDict[chrName]:
            if scoreRange != 0:
                newScore = int(1000 * (e[3] - pwmThreshold) / scoreRange)
            else:
                # The best hit sits exactly on the threshold: the plain
                # formula would divide by zero, so report the top score.
                newScore = 1000
            mpbsDict[chrName].append([e[0], e[1], e[2], newScore, e[4]])

    return 0
def motifMatchingBiopython(combinationList, pwmList, coordDict, pwmLocation,
                           genomeList, tempLocation, fpr=0.01, pseudocounts=0.0,
                           precision=10**4, color="black"):
    """Performs Biopython based motif matching inside the given coordinates
    and returns the matches together with per-combination statistics.

    Keyword arguments:
    combinationList -- List of combination sizes; for each size c, every
                       c-sized combination of pwmList is tracked.
    pwmList -- List of PWMs where each entry represents the name of a PWM file.
    coordDict -- Dictionary of coordinates where the motif matching will be
                 applied. Each coordinate is indexed as [start, end, genes],
                 where genes is a ':'-separated string of names.
    pwmLocation -- Path containing the motif pwm files.
    genomeList -- List of fasta files containing the sequences to perform the
                  motif matching, where the headers are the chromosomes.
    tempLocation -- Location to write temporary PWM files in order to help
                    PWM creation and pseudocounting.
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.0)
    precision -- Motif score distribution precision. (default 10**4)
    color -- Color of the bed entries. 'green', 'red' and 'black' are mapped
             to RGB strings; any other value is used as given. (default 'black')

    Returns:
    mpbsDict -- Dictionary (for each PWM) of dictionaries (for each chromosome)
                of motif predicted binding sites, each stored as
                [start, end, pwmName, rescaledScore, strand, start, end, color].
    statDict -- Dictionary (for each PWM combination) of two counters:
                [regions containing every PWM of the combination,
                 regions that do not].
    geneDict -- Dictionary (for each PWM combination) of the unique genes
                (position NAME in bed file) of the regions containing it.
    """

    # Reading PWM (the temporary folder is the parent of tempLocation)
    pwmTempLocation = "/".join(tempLocation.split("/")[:-1]) + "/"
    pwmDict = dict()
    for pwmName in pwmList:
        pwmDict[pwmName] = readPwmFile(pwmLocation + pwmName + ".pwm",
                                       pwmTempLocation, pseudocounts)

    # Evaluating thresholds
    pwmThresholdDict = dict()
    for pwmName in pwmList:
        sd = Motif.ScoreDistribution(pwmDict[pwmName], precision=precision)
        pwmThresholdDict[pwmName] = sd.threshold_fpr(fpr)

    # Reading genome
    genomeDict = genome.readFastaFiles(genomeList)

    # Creating chromosome list
    chrList = constants.getChromList(reference=[coordDict])

    # Evaluating bed additionals: known color names become RGB strings,
    # anything else is passed through untouched.
    if color == "green":
        color = "0,130,0"
    elif color == "red":
        color = "130,0,0"
    elif color == "black":
        color = "0,0,0"

    # Create combinations dictionary keys
    combKeys = [",".join(e)
                for c in combinationList
                for e in itertools.combinations(pwmList, c)]

    # Result containers
    mpbsDict = dict((e, dict()) for e in pwmDict)
    statDict = dict((e, [0, 0]) for e in combKeys)  # Left is evidence / Right is not evidence
    geneDict = dict((e, []) for e in combKeys)
    # Start below any real score so the first hit always becomes the maximum.
    maxDict = dict((e, float("-inf")) for e in pwmDict)

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome; chromosomes without a sequence are skipped
        if chrName not in genomeDict:
            continue
        sequence = genomeDict[chrName]

        for pwmName in mpbsDict:
            mpbsDict[pwmName][chrName] = []

        # Iterating on coordinate dictionary
        for coord in coordDict[chrName]:

            # Getting current sequence based on coordinates
            regionStart = coord[0]
            currSeq = sequence[regionStart:coord[1]]

            # Keeping track of the factors found in this coordinate
            flagMotifs = dict((e, False) for e in pwmDict)

            # Iterating on PWMs
            for pwmName in pwmDict:
                currPwm = pwmDict[pwmName]
                pwmLen = len(currPwm)
                hits = mpbsDict[pwmName][chrName]
                for pos, score in currPwm.search_pwm(
                        currSeq, threshold=pwmThresholdDict[pwmName]):
                    if score > maxDict[pwmName]:
                        maxDict[pwmName] = score
                    # A negative position is taken to mark a reverse-strand
                    # hit; positions are relative to the region start.
                    if pos >= 0:
                        start = pos + regionStart
                        strand = "+"
                    else:
                        start = -pos + regionStart
                        strand = "-"
                    end = start + pwmLen
                    hits.append([start, end, pwmName, score, strand,
                                 start, end, color])
                    flagMotifs[pwmName] = True

            # Updating statistic counts and genes
            motifsFoundList = [k for k in pwmList if flagMotifs[k]]
            motifsFoundKeys = []
            motifsNotFoundKeys = list(combKeys)
            for c in combinationList:
                for e in itertools.combinations(motifsFoundList, c):
                    b = ",".join(e)
                    motifsFoundKeys.append(b)
                    motifsNotFoundKeys.remove(b)
            for k in motifsFoundKeys:
                statDict[k][0] += 1
                geneDict[k].extend(coord[2].split(":"))
            for k in motifsNotFoundKeys:
                statDict[k][1] += 1

    # Update scores - new scores are within [0,1000]
    for pwmName in pwmDict:
        pwmThreshold = pwmThresholdDict[pwmName]
        scoreRange = maxDict[pwmName] - pwmThreshold
        for chrName in mpbsDict[pwmName]:
            for e in mpbsDict[pwmName][chrName]:
                if scoreRange != 0:
                    e[3] = int(1000 * (e[3] - pwmThreshold) / scoreRange)
                else:
                    # The best hit sits exactly on the threshold: the plain
                    # formula would divide by zero, so report the top score.
                    e[3] = 1000

    # Remove repetitive genes from geneList
    for k in list(geneDict):
        geneDict[k] = list(set(geneDict[k]))

    return mpbsDict, statDict, geneDict
def fimoMM(pwmFileName, genomeFile, mpbsDict, scoringMethod, tempLocation,
           pseudocounts=0.1, bitscore=12.0, fpr=0.01, precision=10**4,
           highCutoff=0.7, functionalDepth=0.9, threshold=0.0001):
    """Performs FIMO motif matching algorithm and writes the results to a
    dictionary indexed by chromosome.

    The PWM is converted to a MEME file, the external 'fimo' program is run
    on genomeFile, and its tab-separated output is parsed. Scores of the kept
    hits are rescaled to integers relative to the cutoff and the best score
    found. The folder holding the MEME file is deleted before returning.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeFile -- Fasta file containing the regions to be analyzed
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched. One of
                     "bitscore", "fpr", "boyle" or "fimo". With "fimo" no
                     score cutoff is applied to FIMO's output.
    tempLocation -- Location to write temporary PWM files in order to help
                    PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)
    threshold -- The cutoff threshold value passed to FIMO's
                 --output-pthresh. Only honoured when scoringMethod is
                 "fimo"; the other methods force it to 0.1. (default 0.0001)

    Returns:
    0 -- Always. The matches are appended in place to mpbsDict[sequence name]
         as [start, end, pwmName, rescaledScore, strand]; missing keys are
         created.

    Exits the interpreter with status 1 if scoringMethod is not recognised.
    Raises OSError if the 'fimo' executable cannot be started.
    """
    # Imported locally so the block does not rely on file-level imports.
    import shutil
    import subprocess

    # Converting jaspar to MEME
    memeFileName = jasparToMeme(pwmFileName, tempLocation, pseudocounts)
    tempDir = "/".join(memeFileName.split("/")[:-1])
    tempPath = tempDir + "/"
    fimoFileName = tempPath + "results.txt"
    errorOutputName = tempPath + "error.txt"

    # Evaluating threshold
    # NOTE(review): functions named biopythonMM and bitScoreMM are also
    # defined in this file; confirm that these names resolve here to objects
    # providing readPwmFile / createPwmDict.
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
        threshold = 0.1
    elif scoringMethod == "fpr":
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation, pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
        threshold = 0.1
        # Single-argument call form: prints the same on Python 2 and 3.
        print(bioPwm.max_score())
    elif scoringMethod == "boyle":
        # TODO Boyle's rule is not suited for negative values.
        maxScore = 0.0
        minScore = 0.0
        pwmBoyle = bitScoreMM.createPwmDict(pwmFileName, pseudocounts)
        pwmLen = len(pwmBoyle["A"])
        for i in range(0, pwmLen):
            maxScore += max(pwmBoyle["A"][i], pwmBoyle["C"][i],
                            pwmBoyle["G"][i], pwmBoyle["T"][i])
        background = math.log(0.25, 2) * pwmLen
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
        threshold = 0.1
    elif scoringMethod == "fimo":
        pass
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # A non-zero status lets calling scripts detect the failure.
        sys.exit(1)

    # Performing FIMO. An argument list (no shell) keeps file names with
    # spaces or shell metacharacters intact.
    fimoCommand = ["fimo", "--text", "--verbosity", "1",
                   "--max-stored-scores", "1000000",
                   "--output-pthresh", str(threshold),
                   memeFileName, genomeFile]
    with open(fimoFileName, "w") as fimoOut:
        with open(errorOutputName, "w") as fimoErr:
            subprocess.call(fimoCommand, stdout=fimoOut, stderr=fimoErr)

    # Reading FIMO output
    tempMpbsDict = dict()
    # Start below any real score so the first hit always becomes the maximum.
    maxValue = float("-inf")
    applyCutoff = (scoringMethod != "fimo")
    with open(fimoFileName, "r") as fimoFile:
        fimoFile.readline()  # first line is discarded (presumably the header)
        for line in fimoFile:
            stripped = line.strip()
            if not stripped:
                continue  # a blank line carries no hit
            fields = stripped.split("\t")
            # First column is the strand sign immediately followed by the
            # motif name.
            strand = fields[0][0]
            motifName = fields[0][1:]
            seqName = fields[1]
            score = float(fields[4])
            if applyCutoff and score < pwmThreshold:
                continue
            if score > maxValue:
                maxValue = score
            if strand == "+":
                start, end = int(fields[2]) - 1, int(fields[3])
            else:
                # For the other strand the two coordinate columns are swapped.
                start, end = int(fields[3]) - 1, int(fields[2])
            tempMpbsDict.setdefault(seqName, []).append(
                [start, end, motifName, score, strand])

    # Update scores - new scores are relative to pwmThreshold and maxValue
    scoreRange = maxValue - pwmThreshold
    for chrName in tempMpbsDict:
        chrHits = mpbsDict.setdefault(chrName, [])
        for e in tempMpbsDict[chrName]:
            if scoreRange != 0:
                newScore = int(1000 * (e[3] - pwmThreshold) / scoreRange)
            else:
                # The best score equals the threshold: the plain formula
                # would divide by zero. Hits at the maximum get the top
                # score, anything below it gets 0.
                newScore = 1000 if e[3] >= maxValue else 0
            chrHits.append([e[0], e[1], e[2], newScore, e[4]])

    # Removing temporary PWM folder. ignore_errors mirrors 'rm -rf', which
    # does not fail on a missing path; an empty path is never removed.
    if tempDir:
        shutil.rmtree(tempDir, ignore_errors=True)

    return 0