Esempio n. 1
0
def conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments):
    """
    conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments) -- Normal-approximation
    score for the number of positions under the site's matches that are flagged in the
    conservation tracks, compared with the background flag rate of each track

    input:  nmer           -- the site: a ConvergeMotifTools.Motif, or a string accepted
                              by AmbigToRegExp (searched on both strands)
            IDs            -- iterable of sequence identifiers
            fsaDict        -- ID -> sequence; every ID must be present (KeyError otherwise)
            ConsDict       -- ID -> list of tracks, one per alignment; IDs missing here are
                              skipped.  A track is indexed by sequence position, or is []
                              when unavailable.  Entries are presumably 0/1 with 1 meaning
                              "unconserved" (inferred from the variable names -- confirm)
            num_alignments -- tracks 0 .. num_alignments-2 of each ConsDict entry are used
    output: Arith.lzprob(Z) when Z >= 0, otherwise 1 - Arith.lzprob(-Z), where
            Z = (flagged + 0.5 - expected) / stdev, or 0 when stdev is 0
    """
    width = len(nmer)
    num_tracks = num_alignments - 1

    # Background rate per track: fraction of all positions flagged with 1.
    thetas = []
    for t in range(num_tracks):
        tot_positions = 0
        tot_unconserved = 0
        for ID in IDs:
            try:
                cons = ConsDict[ID]
            except KeyError:
                continue
            # An empty entry has no tracks at all; skip it like the hit loop does.
            if cons == [] or cons[t] == []:
                continue
            track = cons[t]
            try:
                n_positions = len(track)
                n_unconserved = track.count(1)
            except (TypeError, AttributeError):
                # Best effort: report the malformed entry and leave it out entirely.
                print("cons: %s" % cons)
                continue
            tot_positions += n_positions
            tot_unconserved += n_unconserved
        if tot_positions:
            thetas.append(float(tot_unconserved) / tot_positions)
        else:
            # No usable positions for this track.
            thetas.append(1.0)

    total = [0] * num_tracks
    unconserved = [0] * num_tracks
    is_motif = type(nmer) == type(ConvergeMotifTools.Motif())
    site_re = None
    for ID in IDs:
        seq = fsaDict[ID]
        try:
            cons = ConsDict[ID]
        except KeyError:
            continue
        if cons == []:
            continue

        if is_motif:
            hits = cluster.matches_old(nmer, seq, 0.7)
        else:
            if site_re is None:
                site_re = re.compile(AmbigToRegExp(nmer))
            hits = [m.start() for m in site_re.finditer(seq)]
            seen = set(hits)
            seq_len = len(seq)
            seqrc = ConvergeMotifTools.revcomplement(seq)
            for m in site_re.finditer(seqrc):
                # A match [s, e) on the reverse complement covers forward
                # positions [len - e, len - s), so its forward start is len - e.
                # (Assumes revcomplement returns a string of the same length.)
                start = seq_len - m.end()
                if start not in seen:
                    seen.add(start)
                    hits.append(start)

        active_tracks = [t for t in range(num_tracks) if cons[t] != []]
        for hit in hits:
            for offset in range(width):
                for t in active_tracks:
                    total[t] += 1
                    unconserved[t] += cons[t][hit + offset]

    # Sum of independent binomials, one per track.
    mean = 0
    var = 0
    uc = 0
    for t in range(num_tracks):
        mean += total[t] * thetas[t]
        var += total[t] * thetas[t] * (1 - thetas[t])
        uc += unconserved[t]

    stdev = math.sqrt(var)
    if stdev > 0:
        Z = ((uc + 0.5) - mean) / stdev  # +0.5: continuity correction
    else:
        Z = 0
    if Z >= 0:
        p = Arith.lzprob(Z)
    else:
        p = 1.0 - Arith.lzprob(-Z)
    return p
Esempio n. 2
0
def conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments):
    """
    conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments) -- Normal-approximation
    score for the number of positions under the site's matches that are flagged in the
    conservation tracks, compared with the background flag rate of each track

    input:  nmer           -- the site: a ConvergeMotifTools.Motif, or a string accepted
                              by AmbigToRegExp (searched on both strands)
            IDs            -- iterable of sequence identifiers
            fsaDict        -- ID -> sequence; every ID must be present (KeyError otherwise)
            ConsDict       -- ID -> list of tracks, one per alignment; IDs missing here are
                              skipped.  A track is indexed by sequence position, or is []
                              when unavailable.  Entries are presumably 0/1 with 1 meaning
                              "unconserved" (inferred from the variable names -- confirm)
            num_alignments -- tracks 0 .. num_alignments-2 of each ConsDict entry are used
    output: Arith.lzprob(Z) when Z >= 0, otherwise 1 - Arith.lzprob(-Z), where
            Z = (flagged + 0.5 - expected) / stdev, or 0 when stdev is 0
    """
    width = len(nmer)
    num_tracks = num_alignments - 1

    # Background rate per track: fraction of all positions flagged with 1.
    thetas = []
    for t in range(num_tracks):
        tot_positions = 0
        tot_unconserved = 0
        for ID in IDs:
            try:
                cons = ConsDict[ID]
            except KeyError:
                continue
            # An empty entry has no tracks at all; skip it like the hit loop does.
            if cons == [] or cons[t] == []:
                continue
            track = cons[t]
            try:
                n_positions = len(track)
                n_unconserved = track.count(1)
            except (TypeError, AttributeError):
                # Best effort: report the malformed entry and leave it out entirely.
                print("cons: %s" % cons)
                continue
            tot_positions += n_positions
            tot_unconserved += n_unconserved
        if tot_positions:
            thetas.append(float(tot_unconserved) / tot_positions)
        else:
            # No usable positions for this track.
            thetas.append(1.0)

    total = [0] * num_tracks
    unconserved = [0] * num_tracks
    is_motif = type(nmer) == type(ConvergeMotifTools.Motif())
    site_re = None
    for ID in IDs:
        seq = fsaDict[ID]
        try:
            cons = ConsDict[ID]
        except KeyError:
            continue
        if cons == []:
            continue

        if is_motif:
            hits = cluster.matches_old(nmer, seq, 0.7)
        else:
            if site_re is None:
                site_re = re.compile(AmbigToRegExp(nmer))
            hits = [m.start() for m in site_re.finditer(seq)]
            seen = set(hits)
            seq_len = len(seq)
            seqrc = ConvergeMotifTools.revcomplement(seq)
            for m in site_re.finditer(seqrc):
                # A match [s, e) on the reverse complement covers forward
                # positions [len - e, len - s), so its forward start is len - e.
                # (Assumes revcomplement returns a string of the same length.)
                start = seq_len - m.end()
                if start not in seen:
                    seen.add(start)
                    hits.append(start)

        active_tracks = [t for t in range(num_tracks) if cons[t] != []]
        for hit in hits:
            for offset in range(width):
                for t in active_tracks:
                    total[t] += 1
                    unconserved[t] += cons[t][hit + offset]

    # Sum of independent binomials, one per track.
    mean = 0
    var = 0
    uc = 0
    for t in range(num_tracks):
        mean += total[t] * thetas[t]
        var += total[t] * thetas[t] * (1 - thetas[t])
        uc += unconserved[t]

    stdev = math.sqrt(var)
    if stdev > 0:
        Z = ((uc + 0.5) - mean) / stdev  # +0.5: continuity correction
    else:
        Z = 0
    if Z >= 0:
        p = Arith.lzprob(Z)
    else:
        p = 1.0 - Arith.lzprob(-Z)
    return p
Esempio n. 3
0
def WMWtest(A, B):
    """
    WMWtest(A,B) -- Computes the Wilcoxon-Mann-Whitney nonparametric W statistic for two distributions

    input:  list of numbers, list of numbers (neither list is modified)
    output: p-value, W-statistic

    W is the rank sum of the shorter sample (B when the lengths are equal),
    folded to MaxSum - W when it exceeds MaxSum / 2.  Tied values share the
    mean of the ranks they occupy.
    """
    # Work on copies so the caller's lists are not reordered.
    sample_a = list(A)
    sample_b = list(B)
    TotalList = sorted(sample_a + sample_b)

    nA = len(sample_a)
    nB = len(sample_b)
    N = nA + nB
    MaxSum = N * (N + 1) / 2.0
    H0 = MaxSum / 2.0

    ## Replace values by ranks; each run of equal values gets its mean rank
    Total_rank = []
    rank_of = {}
    start = 0
    while start < N:
        stop = start + 1
        while stop < N and TotalList[stop] == TotalList[start]:
            stop += 1
        if stop - start == 1:
            rank = start + 1
        else:
            # The run occupies ranks start+1 .. stop.
            rank = (start + stop + 1) / 2.0
        Total_rank.extend([rank] * (stop - start))
        rank_of[TotalList[start]] = rank
        start = stop

    ## Determine the shortest list
    if nA < nB:
        shortest = sample_a
    else:
        shortest = sample_b
    nShortest = len(shortest)

    ## Sum the ranks in the shortest list
    W = 0
    for Value in shortest:
        W += rank_of[Value]

    ## Use the smallest value of W
    if W > H0:
        W = MaxSum - W

    ## Determine the two-tailed level of significance
    p = 0

    ## First calculate the Normal approximation. This can be used to
    ## check whether a significant result is plausible for larger N.
    # k_out_n is presumably the number of ways to choose nA items out of N.
    Permutations = k_out_n(nA, N)
    if (Permutations >= 25000) or (nShortest > 10):
        if W >= H0:
            Continuity = -0.5
        else:
            Continuity = 0.5
        Z = (W + Continuity - nShortest *
             (N + 1.0) / 2.0) / sqrt(nA * nB * (N + 1) / 12.0)
        Z = fabs(Z)
        p = 2 * (1 - Arith.lzprob(Z))

    ## The exact level of significance, for large N, first check whether a
    ## significant result is plausible, i.e., the Normal Approximation gives
    ## a p < 0.25.
    if (nShortest + 1 < 10) and (p < 0.25) and (Permutations < 60000):
        # Remember that W must be SMALLER than MaxSum/2 = H0
        Less = CountSmallerRanks(W, 0, len(shortest) - 1, 0, Total_rank)
        # If Less is more than half of Permutations, we have counted the
        # wrong tail. We should have counted UPWARD (higher than W).
        # We can't do that, but we can calculate Less for W-1 and
        # subtract it from Permutations
        if 2 * Less > Permutations:
            Less = CountSmallerRanks(W - 1, 0,
                                     len(shortest) - 1, 0, Total_rank)
            Less = Permutations - Less
        SumFrequencies = Permutations
        p = 2.0 * Less / SumFrequencies

    return p, W
Esempio n. 4
0
def WMWtest(A,B):
    """
    WMWtest(A,B) -- Computes the Wilcoxon-Mann-Whitney nonparametric W statistic for two distributions

    input:  list of numbers, list of numbers (neither list is modified)
    output: p-value, W-statistic

    W is the rank sum of the shorter sample (B when the lengths are equal),
    folded to MaxSum - W when it exceeds MaxSum / 2.  Tied values share the
    mean of the ranks they occupy.
    """
    # Work on copies so the caller's lists are not reordered.
    sample_a = list(A)
    sample_b = list(B)
    TotalList = sorted(sample_a + sample_b)

    nA     = len(sample_a)
    nB     = len(sample_b)
    N      = nA + nB
    MaxSum = N*(N+1)/2.0
    H0     = MaxSum / 2.0

    ## Replace values by ranks; each run of equal values gets its mean rank
    Total_rank = []
    rank_of = {}
    start = 0
    while start < N:
        stop = start + 1
        while stop < N and TotalList[stop] == TotalList[start]:
            stop += 1
        if stop - start == 1:
            rank = start + 1
        else:
            # The run occupies ranks start+1 .. stop.
            rank = (start + stop + 1) / 2.0
        Total_rank.extend([rank] * (stop - start))
        rank_of[TotalList[start]] = rank
        start = stop

    ## Determine the shortest list
    if nA < nB:
        shortest = sample_a
    else:
        shortest = sample_b
    nShortest = len(shortest)

    ## Sum the ranks in the shortest list
    W = 0
    for Value in shortest:
        W += rank_of[Value]

    ## Use the smallest value of W
    if W > H0:
        W = MaxSum - W

    ## Determine the two-tailed level of significance
    p = 0

    ## First calculate the Normal approximation. This can be used to
    ## check whether a significant result is plausible for larger N.
    # k_out_n is presumably the number of ways to choose nA items out of N.
    Permutations = k_out_n(nA, N)
    if (Permutations >= 25000) or (nShortest > 10):
        if W >= H0:
            Continuity = -0.5
        else:
            Continuity = 0.5
        Z = (W+Continuity-nShortest*(N+1.0)/2.0)/sqrt(nA*nB*(N+1)/12.0)
        Z = fabs(Z)
        p = 2*(1-Arith.lzprob(Z))

    ## The exact level of significance, for large N, first check whether a
    ## significant result is plausible, i.e., the Normal Approximation gives
    ## a p < 0.25.
    if (nShortest+1 < 10) and (p < 0.25) and (Permutations < 60000):
        # Remember that W must be SMALLER than MaxSum/2 = H0
        Less = CountSmallerRanks(W, 0, len(shortest)-1, 0, Total_rank)
        # If Less is more than half of Permutations, we have counted the
        # wrong tail. We should have counted UPWARD (higher than W).
        # We can't do that, but we can calculate Less for W-1 and
        # subtract it from Permutations
        if 2*Less > Permutations:
            Less = CountSmallerRanks(W-1, 0, len(shortest)-1, 0, Total_rank)
            Less = Permutations - Less
        SumFrequencies = Permutations
        p = 2.0 * Less / SumFrequencies

    return p, W