Ejemplo n.º 1
0
def bitScoreMM(pwmFileName,
               genomeDict,
               mpbsDict,
               scoringMethod,
               tempLocation,
               pseudocounts=0.1,
               bitscore=12.0,
               fpr=0.01,
               precision=10**4,
               highCutoff=0.7,
               functionalDepth=0.9):
    """Performs basic motif matching algorithm and writes the results to a dictionary indexed by chromosome.

    Every window of the motif length is scored on both strands. Windows whose
    score is strictly greater than the threshold are appended to
    mpbsDict[chrName] as [start, end, pwmName, score, strand], where start/end
    are 0-based indexes into the chromosome sequence (end exclusive) and score
    is rescaled to an integer within [0,1000].

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary (chromosome name -> sequence string). Must contain every chromosome
                  returned by constants.getChromList(reference=[mpbsDict]).
    mpbsDict -- Dictionary of MPBSs to insert the results. Must already hold a list for each chromosome
                that receives hits.
    scoringMethod -- Method to evaluate which MPBSs are enriched ("bitscore", "fpr" or "boyle").
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- Always. The results are inserted in place on the mpbsDict.

    Exits the process with status 1 if scoringMethod is not a valid method.
    """

    # Reading PWM
    pwm = createPwmDict(pwmFileName, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm["A"])
    # Score of a uniform (0.25 per base) background over the motif length;
    # it is subtracted from every raw PWM sum computed below.
    background = math.log(0.25, 2) * pwmLen

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation,
                                         pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        maxScore = 0.0
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        for i in range(0, pwmLen):
            maxScore += max(pwm["A"][i], pwm["C"][i], pwm["G"][i], pwm["T"][i])
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # An invalid method is an error: report it with a non-zero status.
        sys.exit(1)

    # Creating additional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict([(e, []) for e in chrList])
    # -inf guarantees the first hit always becomes the running maximum,
    # whatever the magnitude of the scores.
    maxValue = float("-inf")
    revDict = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName].upper()
        hits = tempMpbsDict[chrName]

        # Performing motif matching
        # NOTE: xrange keeps this loop lazy under Python 2, which this file targets.
        for pos in xrange(0, len(sequence) - pwmLen + 1):
            end = pos + pwmLen
            scoreF = -background
            scoreR = -background
            for i in range(0, pwmLen):
                scoreF += pwm[sequence[pos + i]][i]
                # Reverse strand: read the window backwards and complement each base.
                scoreR += pwm[revDict[sequence[end - i - 1]]][i]
            if scoreF > pwmThreshold:
                if scoreF > maxValue:
                    maxValue = scoreF
                hits.append([pos, end, pwmName, scoreF, "+"])
            if scoreR > pwmThreshold:
                if scoreR > maxValue:
                    maxValue = scoreR
                hits.append([pos, end, pwmName, scoreR, "-"])

    # Update scores - new scores are within [0,1000]
    # Every stored hit is strictly above pwmThreshold, so whenever this loop
    # body runs maxValue > pwmThreshold and the denominator is positive.
    for chrName in chrList:
        for start, end, name, score, strand in tempMpbsDict[chrName]:
            mpbsDict[chrName].append([
                start, end, name,
                int(1000 * (score - pwmThreshold) / (maxValue - pwmThreshold)),
                strand
            ])

    return 0
Ejemplo n.º 2
0
def biopythonMM(pwmFileName,
                genomeDict,
                mpbsDict,
                scoringMethod,
                tempLocation,
                pseudocounts=0.1,
                bitscore=12.0,
                fpr=0.01,
                precision=10**4,
                highCutoff=0.7,
                functionalDepth=0.9):
    """Performs Biopython based motif matching and writes the results to a dictionary indexed by chromosome.

    Every (pos, score) pair yielded by the PWM's search_pwm method is appended
    to mpbsDict[chrName] as [start, end, pwmName, score, strand]. A negative
    pos is recorded as a "-" strand hit starting at -pos. Scores are rescaled
    to integers relative to the threshold and to the best score found.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary (chromosome name -> sequence). Must contain every chromosome
                  returned by constants.getChromList(reference=[mpbsDict]).
    mpbsDict -- Dictionary of MPBSs to insert the results. Must already hold a list for each chromosome
                that receives hits.
    scoringMethod -- Method to evaluate which MPBSs are enriched ("bitscore", "fpr" or "boyle").
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- Always. The results are inserted in place on the mpbsDict.

    Exits the process with status 1 if scoringMethod is not a valid method.
    """

    # Reading PWM
    pwm = readPwmFile(pwmFileName, tempLocation, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm)

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        sd = Motif.ScoreDistribution(pwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        maxScore = pwm.max_score()
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # An invalid method is an error: report it with a non-zero status.
        sys.exit(1)

    # Creating additional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict([(e, []) for e in chrList])
    # -inf guarantees the first hit always becomes the running maximum,
    # whatever the magnitude of the scores.
    maxValue = float("-inf")

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName]
        hits = tempMpbsDict[chrName]

        # Performing biopython's motif matching
        # NOTE(review): a reverse-strand hit at position 0 would arrive as
        # pos == 0 and be recorded as "+" -- confirm how search_pwm encodes it.
        for pos, score in pwm.search_pwm(sequence, threshold=pwmThreshold):
            if score > maxValue:
                maxValue = score
            if pos >= 0:
                hits.append([pos, pos + pwmLen, pwmName, score, "+"])
            else:
                hits.append([-pos, -pos + pwmLen, pwmName, score, "-"])

    # Update scores - new scores are within [0,1000]
    for chrName in chrList:
        for start, end, name, score, strand in tempMpbsDict[chrName]:
            scoreRange = maxValue - pwmThreshold
            # Defensive guard: if the best hit equals the threshold the range
            # is zero; every hit then sits on the threshold and is scored 0
            # instead of raising ZeroDivisionError.
            if scoreRange:
                newScore = int(1000 * (score - pwmThreshold) / scoreRange)
            else:
                newScore = 0
            mpbsDict[chrName].append([start, end, name, newScore, strand])

    return 0
Ejemplo n.º 3
0
def motifMatchingBiopython(combinationList,pwmList,coordDict,pwmLocation,genomeList,tempLocation,fpr=0.01,pseudocounts=0.0,precision=10**4,color="black"):
    """Performs Biopython based motif matching and returns the matches together with
       co-occurrence statistics for the requested PWM combinations.

    Each coordinate region is searched with every PWM. For every combination key
    (comma-joined PWM names, built from pwmList for each size in combinationList)
    a region counts as evidence when all PWMs of the combination have at least one
    hit inside it. Chromosomes that are absent from the genome are skipped.

    Keyword arguments:
    combinationList -- List of the number of cobinding combinations.
    pwmList -- List of PWMs where each entry represents the name of a PWM file.
    coordDict -- Dictionary of coordinates where the motif matching will be applied. Each coordinate is
                 indexed as [start, end, name], where name is a ':'-separated list of genes.
    pwmLocation -- Path containing the motif pwm files.
    genomeList -- List of fasta files containing the sequences to perform the motif matching, where the headers are the chromosomes.
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.0)
    precision -- Motif score distribution precision. (default 10**4)
    color -- Color of the bed entries. Can be 'green', 'red' or 'black'; any other value is stored unchanged. (default 'black')

    Returns:
    mpbsDict -- Dictionary (for each PWM) of dictionaries (for each chromosome) of motif predicted binding sites.
                Each site is [start, end, pwmName, score, strand, start, end, color] in chromosome coordinates.
    statDict -- Dictionary of statistics for Fisher test concerning the number of motifs inside enriched regions.
                Each value is [evidence count, non-evidence count].
    geneDict -- Dictionary of genes (position NAME in bed file) that contains each motif.
    """

    # Reading PWM
    # The temporary path is the same for every PWM, so it is built only once.
    pwmTempPath = "/".join(tempLocation.split("/")[:-1]) + "/"
    pwmDict = dict()
    for pwmName in pwmList:
        pwmDict[pwmName] = readPwmFile(pwmLocation + pwmName + ".pwm", pwmTempPath, pseudocounts)

    # Evaluating thresholds
    pwmThresholdDict = dict()
    for pwmName in pwmList:
        sd = Motif.ScoreDistribution(pwmDict[pwmName], precision=precision)
        pwmThresholdDict[pwmName] = sd.threshold_fpr(fpr)

    # Reading genome
    genomeDict = genome.readFastaFiles(genomeList)

    # Creating chromosome list (chrX, chrY and chrM are no longer removed)
    chrList = constants.getChromList(reference=[coordDict])

    # Evaluating bed additionals
    if color == "green":
        color = "0,130,0"
    elif color == "red":
        color = "130,0,0"
    elif color == "black":
        color = "0,0,0"

    # Create combinations dictionary keys
    combKeys = []
    for c in combinationList:
        for e in itertools.combinations(pwmList, c):
            combKeys.append(",".join(e))

    # Iterating on chromosomes
    mpbsDict = dict([(e, dict()) for e in pwmDict.keys()])
    statDict = dict([(e, [0, 0]) for e in combKeys])  # Left is evidence / Right is not evidence
    geneDict = dict([(e, []) for e in combKeys])
    # -inf guarantees the first hit always becomes the running maximum.
    maxDict = dict([(e, float("-inf")) for e in pwmDict.keys()])
    for chrName in chrList:

        # Reading genome
        if chrName not in genomeDict:
            continue
        sequence = genomeDict[chrName]

        # Iterating on coordinate dictionary
        for e in mpbsDict.keys():
            mpbsDict[e][chrName] = []
        for coord in coordDict[chrName]:

            # Getting current sequence based on coordinates
            regionStart = coord[0]
            currSeq = sequence[regionStart:coord[1]]

            # Keeping track of the factors found in this coordinate
            flagMotifs = dict([(e, False) for e in pwmDict.keys()])

            # Iterating on PWMs
            for pwmName in pwmDict.keys():
                pwmLen = len(pwmDict[pwmName])
                siteList = mpbsDict[pwmName][chrName]
                for pos, score in pwmDict[pwmName].search_pwm(currSeq, threshold=pwmThresholdDict[pwmName]):
                    if score > maxDict[pwmName]:
                        maxDict[pwmName] = score
                    # A negative position marks a reverse-strand hit at -pos;
                    # positions are shifted from region to chromosome coordinates.
                    if pos >= 0:
                        start = pos + regionStart
                        strand = "+"
                    else:
                        start = -pos + regionStart
                        strand = "-"
                    end = start + pwmLen
                    siteList.append([start, end, pwmName, score, strand, start, end, color])
                    flagMotifs[pwmName] = True

            # Updating statistic counts and genes
            motifsFoundList = [k for k in pwmList if flagMotifs[k]]
            motifsFoundKeys = []
            motifsNotFoundKeys = [e for e in combKeys]
            for c in combinationList:
                for e in itertools.combinations(motifsFoundList, c):
                    b = ",".join(e)
                    motifsFoundKeys.append(b)
                    motifsNotFoundKeys.remove(b)
            for k in motifsFoundKeys:
                statDict[k][0] += 1
                for e in coord[2].split(":"):
                    geneDict[k].append(e)
            for k in motifsNotFoundKeys:
                statDict[k][1] += 1

    # Update scores - new scores are within [0,1000]
    for pwmName in pwmDict.keys():
        threshold = pwmThresholdDict[pwmName]
        scoreRange = maxDict[pwmName] - threshold
        for chrName in mpbsDict[pwmName].keys():
            for e in mpbsDict[pwmName][chrName]:
                # Defensive guard: a zero range means the best hit equals the
                # threshold, so every hit is scored 0 instead of raising
                # ZeroDivisionError.
                if scoreRange:
                    e[3] = int(1000 * (e[3] - threshold) / scoreRange)
                else:
                    e[3] = 0

    # Remove repetitive genes from geneList
    for k in geneDict.keys():
        geneDict[k] = list(set(geneDict[k]))

    return mpbsDict, statDict, geneDict
Ejemplo n.º 4
0
def fimoMM(pwmFileName,
           genomeFile,
           mpbsDict,
           scoringMethod,
           tempLocation,
           pseudocounts=0.1,
           bitscore=12.0,
           fpr=0.01,
           precision=10**4,
           highCutoff=0.7,
           functionalDepth=0.9,
           threshold=0.0001):
    """Performs FIMO motif matching algorithm and writes the results to a dictionary indexed by chromosome.

    The PWM is converted to MEME format, the external 'fimo' program is run on
    genomeFile and its text output is parsed. For every scoring method other
    than "fimo" the p-value cutoff given to FIMO is loosened to 0.1 and hits
    scoring below the method's own threshold are discarded afterwards. The
    remaining hits are appended to mpbsDict[sequence name] as
    [start, end, motifName, score, strand]; a missing key is created.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeFile -- Fasta file containing the regions to be analyzed
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched ("bitscore", "fpr", "boyle" or "fimo").
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)
    threshold -- The cutoff threshold value (FIMO p-value). Only honoured when scoringMethod is "fimo". (default 0.0001)

    Returns:
    0 -- Always. The results are inserted in place on the mpbsDict.

    Exits the process with status 1 if scoringMethod is not a valid method.
    """
    # Local import: only this function needs it for the temporary folder cleanup.
    import shutil

    # Converting jaspar to MEME
    memeFileName = jasparToMeme(pwmFileName, tempLocation, pseudocounts)
    tempDir = "/".join(memeFileName.split("/")[:-1])
    tempPath = tempDir + "/"
    fimoFileName = tempPath + "results.txt"
    errorOutputName = tempPath + "error.txt"

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
        threshold = 0.1
    elif scoringMethod == "fpr":
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation,
                                         pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
        threshold = 0.1
        # Kept from the original: reports the PWM's maximum score on stdout.
        # The parenthesised form prints the same text under Python 2 and 3.
        print(bioPwm.max_score())
    elif scoringMethod == "boyle":
        maxScore = 0.0
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        pwmBoyle = bitScoreMM.createPwmDict(pwmFileName, pseudocounts)
        pwmLen = len(pwmBoyle["A"])
        for i in range(0, pwmLen):
            maxScore += max(pwmBoyle["A"][i], pwmBoyle["C"][i],
                            pwmBoyle["G"][i], pwmBoyle["T"][i])
        background = math.log(0.25, 2) * pwmLen
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
        threshold = 0.1
    elif scoringMethod == "fimo":
        pass
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # An invalid method is an error: report it with a non-zero status.
        sys.exit(1)

    # Performing FIMO
    # NOTE(review): the file names are interpolated into a shell command
    # unquoted, so paths containing spaces or shell metacharacters will break.
    os.system(
        "fimo --text --verbosity 1 --max-stored-scores 1000000 --output-pthresh "
        + str(threshold) + " " + memeFileName + " " + genomeFile + " > " +
        fimoFileName + " 2> " + errorOutputName)

    # Reading FIMO output
    tempMpbsDict = dict()
    # -inf guarantees the first kept hit always becomes the running maximum.
    maxValue = float("-inf")
    # The 'with' block closes the file even if a malformed line raises.
    with open(fimoFileName, "r") as fimoFile:
        fimoFile.readline()  # Skipping the header line
        for line in fimoFile:
            ll = line.strip().split("\t")
            # The first column carries the strand as its first character,
            # followed by the motif name; split them into two fields.
            ll = [ll[0][0], ll[0][1:]] + ll[1:]
            strand = ll[0]
            score = float(ll[5])
            if scoringMethod != "fimo" and score < pwmThreshold:
                continue
            if score > maxValue:
                maxValue = score
            # Reverse-strand hits have their two coordinate columns swapped.
            if strand == "+":
                start = int(ll[3]) - 1
                end = int(ll[4])
            else:
                start = int(ll[4]) - 1
                end = int(ll[3])
            tempMpbsDict.setdefault(ll[2], []).append(
                [start, end, ll[1], score, strand])

    # Update scores relative to pwmThreshold and to the best score found
    # NOTE(review): with scoringMethod "fimo" pwmThreshold stays 0.0, so
    # negative FIMO scores would fall outside [0,1000] -- confirm intent.
    for chrName in tempMpbsDict.keys():
        scoreRange = maxValue - pwmThreshold
        siteList = mpbsDict.setdefault(chrName, [])
        for e in tempMpbsDict[chrName]:
            # Hits equal to the threshold are kept, so the best hit can equal
            # the threshold; the range is then zero and the hits are scored 0
            # instead of raising ZeroDivisionError.
            if scoreRange:
                newScore = int(1000 * (e[3] - pwmThreshold) / scoreRange)
            else:
                newScore = 0
            siteList.append([e[0], e[1], e[2], newScore, e[4]])

    # Removing temporary PWM folder
    # shutil.rmtree avoids passing an unquoted path to 'rm -rf' through the
    # shell. An empty path is skipped, matching 'rm -rf' with no operand.
    if tempDir:
        shutil.rmtree(tempDir, ignore_errors=True)

    return 0