def compute_confidence_intervals(scaledContainmentsObserverved,
                                 L,
                                 k,
                                 confidence,
                                 s,
                                 debug=False):
    """Derive containment and mutation-rate bounds for each observed value.

    Two root-finding stages (both via ``brentq``) are run per observation:

    1. Find ``Nm`` in ``[0, L]`` such that ``1 - Nm/L`` shifted up (resp.
       down) by ``z_alpha * sqrt(Nm * (L - Nm) * (1 - s) / (s * L**3))``
       equals the observed value.  The two roots are mapped to ``Clow``
       and ``Chigh`` through ``1 - root / L``.
    2. Find ``pest`` in ``[0, 1]`` such that ``(1 - pest)**k`` shifted up
       (resp. down) by ``z_alpha * sqrt(thm5.var_n_mutated(L, k, pest)) / L``
       equals ``Clow`` (resp. ``Chigh``).  The roots are ``phigh`` and
       ``plow``.

    Args:
        scaledContainmentsObserverved: iterable of observed values, one
            result row is produced per element.
        L: upper end of the ``Nm`` search bracket and the normaliser in the
            formulas above (presumably the number of k-mers -- confirm).
        k: exponent used in ``(1 - pest)**k`` (presumably the k-mer size).
        confidence: two-sided confidence level; ``z_alpha`` is
            ``probit(1 - (1 - confidence) / 2)``.
        s: value entering the ``(1 - s) / (s * L**3)`` factor (presumably
            the sketch scale factor -- confirm).
        debug: accepted for interface compatibility; not read here.

    Returns:
        A list with one entry per observation, each being
        ``[L, k, confidence, observed, Clow, Chigh, plow, phigh]``.
    """
    z_alpha = probit(1 - (1 - confidence) / 2)

    def sampling_margin(Nm):
        # half-width contributed by the first-stage term
        return z_alpha * sqrt(1.0 * Nm * (L - Nm) * (1 - s) / (s * L**3))

    def mutation_margin(pest):
        # half-width contributed by the second-stage (thm5) term
        return z_alpha * sqrt(thm5.var_n_mutated(L, k, pest)) / L

    rows = []
    for observed in scaledContainmentsObserverved:

        def upper_gap(Nm):
            return mpf(1 - 1.0 * Nm / L + sampling_margin(Nm) - observed)

        def lower_gap(Nm):
            return mpf(1 - 1.0 * Nm / L - sampling_margin(Nm) - observed)

        # stage 1: both roots are located before either bound is derived
        root_upper = brentq(upper_gap, 0, L)
        root_lower = brentq(lower_gap, 0, L)
        Clow = 1 - 1.0 * root_upper / L
        Chigh = 1 - 1.0 * root_lower / L

        def high_rate_gap(pest):
            return mpf((1 - pest)**k + mutation_margin(pest) - Clow)

        def low_rate_gap(pest):
            return mpf((1 - pest)**k - mutation_margin(pest) - Chigh)

        # stage 2: translate the containment bounds into rate bounds
        phigh = brentq(high_rate_gap, 0.0, 1.0)
        plow = brentq(low_rate_gap, 0.0, 1.0)

        rows.append([L, k, confidence, observed, Clow, Chigh, plow, phigh])
    return rows
Beispiel #2
0
def var_c_scaled_one_step(L, k, p, s, confidence):
    """Return the scaled variance expression evaluated at ``p``.

    The result is ``multiplier * inner(p)`` where

    * ``multiplier = (1 - s)**2 / (L**6 * s**2 * (1 - (1 - s)**L)**2)``
    * ``inner(p) = L**2 * thm5.var_n_mutated + var_n_mutated_squared
      - 2 * L * (exp_n_mutated_cubed - exp_n_mutated * exp_n_mutated_squared)``
      with every helper called as ``helper(L, k, p)``.

    The inner value and the multiplier are printed to stdout before the
    product is returned.  ``confidence`` is accepted but not read.
    """
    bias = 1 - (1 - s)**L
    scale = (1 - s)**2 / (L**6 * s**2 * bias**2)

    def inner_variance(pest):
        weighted_var = L**2 * thm5.var_n_mutated(L, k, pest)
        var_of_square = var_n_mutated_squared(L, k, pest)
        third_moment = exp_n_mutated_cubed(L, k, pest)
        mean_times_second = exp_n_mutated(L, k, pest) * exp_n_mutated_squared(
            L, k, pest)
        cross_term = third_moment - mean_times_second
        return weighted_var + var_of_square - 2 * L * cross_term

    # diagnostic output; the inner expression is evaluated again below
    print(inner_variance(p))
    print(scale)
    return scale * inner_variance(p)
def main():
    """Simulate mutated k-mer sequences and report r1 estimation statistics.

    Command-line arguments are read from ``argv[1:]``: k-mer size, k-mer
    sequence length, number of trials, noise model with its mutation
    probability, confidence level and reporting options.  For every trial
    the mutation model generates a (conceptual) sequence pair; the numbers
    of errors, mutated k-mers and islands are recorded and r1 is estimated
    from both nMutated and nIsland.

    Output:
        * one tab-separated row per trial on stdout, preceded by a header;
        * the summary statistics named in ``statsOfInterest``, written to
          the ``--stats=`` file (gzip-compressed for ``.gz``/``.gzip``
          names) or, when no file was given, to stderr.

    Side effects: rebinds the module globals ``reportProgress`` and
    ``debug``; seeds the PRNG when ``--seed=`` is given; calls ``usage()``
    for invalid arguments (presumably terminating the program -- verify
    against its definition).
    """
    global reportProgress, debug

    # parse the command line

    kmerSize = 28
    kmerSequenceLength = 100
    numSequences = None
    noiseKind = None
    pSubstitution = None
    sequenceType = "linear"
    confidence = 0.99
    ciUseInverse = True
    sortBy = "nMutated"
    statsFilename = None
    prngSeed = None
    reportProgress = None
    debug = []

    statsOfInterest = [
        "r1", "k", "L", "confidence", "trials", "q", "E[nMut].theory",
        "StDev[nMut].theory", "inCI(r1est.nMut).obs", "Mean[nMut].obs",
        "StDev[nMut].obs", "RMSE(StDev[nMut])", "RMSE(r1est.nMut)",
        "E[nIsl].theory", "StDev[nIsl].theory", "inCI(r1est.nIsl).obs",
        "Mean[nIsl].obs", "StDev[nIsl].obs", "RMSE(StDev[nIsl])",
        "RMSE(r1est.nIsl)", "r1est.nIsl.impossible"
    ]

    for arg in argv[1:]:
        # every option that consumes argVal is matched by a prefix ending
        # in "=", so argVal is always freshly bound when it is read
        if "=" in arg:
            argVal = arg.split("=", 1)[1]

        if arg in ["--help", "-help", "--h", "-h"]:
            usage()
        elif arg.startswith(("--kmer=", "K=")):
            kmerSize = int(argVal)
        elif arg.startswith(("--set=", "N=", "L=")):
            kmerSequenceLength = int_with_unit(argVal)
        elif arg.startswith(("--sequences=", "T=")):
            numSequences = int_with_unit(argVal)
        elif arg.startswith(("--poisson=", "--noise=", "P=")):
            noiseKind = "poisson"
            pSubstitution = parse_probability(argVal)
        elif arg.startswith(("--bernoulli=", "--error=", "B=", "E=")):
            noiseKind = "bernoulli"
            pSubstitution = parse_probability(argVal)
        elif arg == "--linear":
            sequenceType = "linear"
        elif arg == "--circular":
            sequenceType = "circular"
        elif arg.startswith(("--confidence=", "C=")):
            confidence = parse_probability(argVal)
        elif arg == "--noinverse":
            ciUseInverse = False
        elif arg == "--nosort":
            sortBy = None
        elif arg.startswith("--stats="):
            statsFilename = argVal
        elif arg.startswith("--seed="):
            prngSeed = argVal
        elif arg.startswith("--progress="):
            reportProgress = int_with_unit(argVal)
        elif arg == "--debug":
            debug += ["debug"]
        elif arg.startswith("--debug="):
            debug += argVal.split(",")
        else:
            usage("unrecognized option: %s" % arg)

    if pSubstitution is None:
        usage("you have to tell me the mutation probability")

    if numSequences is None:
        numSequences = 1

    if sequenceType == "circular":
        # all the estimator code assumes linear sequences
        usage("circular sequences are not currently supported")

    # set up randomness

    if prngSeed is not None:
        random_seed(prngSeed.encode("utf-8"))
        if "prng" in debug:
            print("prng = %s" % prngSeed, file=stderr)

    # set up model/generator

    if (noiseKind == "poisson") and (sequenceType == "linear"):
        mutationModel = PoissonModel \
                          (kmerSequenceLength+kmerSize-1,kmerSize,pSubstitution,
                           count_mutated_kmers_linear_naive if ("naive" in debug) else count_mutated_kmers_linear,
                           count_islands_linear)
    elif (noiseKind == "bernoulli") and (sequenceType == "linear"):
        mutationModel = BernoulliModel \
                          (kmerSequenceLength+kmerSize-1,kmerSize,pSubstitution,
                           count_mutated_kmers_linear_naive if ("naive" in debug) else count_mutated_kmers_linear,
                           count_islands_linear)
    elif (noiseKind == "poisson") and (sequenceType == "circular"):
        mutationModel = PoissonModel \
                          (kmerSequenceLength,kmerSize,pSubstitution,
                           count_mutated_kmers_circular_naive if ("naive" in debug) else count_mutated_kmers_circular,
                           count_islands_circular)
    elif (noiseKind == "bernoulli") and (sequenceType == "circular"):
        mutationModel = BernoulliModel \
                          (kmerSequenceLength,kmerSize,pSubstitution,
                           count_mutated_kmers_circular_naive if ("naive" in debug) else count_mutated_kmers_circular,
                           count_islands_circular)
    else:
        assert (False), "internal error"

    # generate sequences and collect stats

    nErrorsObserved = []
    nMutatedObserved = []
    nIslandObserved = []
    r1EstNMutatedObserved = []
    r1EstNIslandObserved = []
    numImpossibleNislands = 0  # counts when nIsland can't be achieved with any r1

    for seqNum in range(numSequences):
        if reportProgress is not None:
            if (1 + seqNum == 1) or ((1 + seqNum) % reportProgress == 0):
                print("testing sequence %d" % (1 + seqNum), file=stderr)

        # generate a (conceptual) sequence pair and collect stats

        mutationModel.generate()
        (nErrors, nMutated, nIsland) = mutationModel.count()
        nErrorsObserved += [nErrors]
        nMutatedObserved += [nMutated]
        nIslandObserved += [nIsland]

        r1EstNMutated = estimate_r1_from_n_mutated(kmerSequenceLength,
                                                   kmerSize, nMutated)
        r1EstNMutatedObserved += [r1EstNMutated]

        # note: when r1 is estimated from nIsland, there can be more than one
        # solution; we take the solution that is closest to r1 estimated from
        # nMutated (the first one wins on ties, as min() keeps the earliest)

        r1EstNIslandList = estimate_r1_from_n_island(kmerSequenceLength,
                                                     kmerSize, nIsland)
        if len(r1EstNIslandList) == 0:
            r1EstNIsland = float("nan")
        else:
            r1EstNIsland = min(
                r1EstNIslandList,
                key=lambda r1Est: abs(r1Est - r1EstNMutated))
        r1EstNIslandObserved += [r1EstNIsland]

        if impossible_n_island(kmerSequenceLength, kmerSize, nIsland):
            numImpossibleNislands += 1

    # report per-trial results

    if sortBy == "nMutated":
        # ascending by nMutated, trial index breaks ties
        order = [
            ix for (_, ix) in sorted(
                (nMutatedObserved[ix], ix) for ix in range(numSequences))
        ]
    else:  # if (sortBy == None):
        order = list(range(numSequences))

    header = [
        "L", "K", "r", "trial", "nErr", "nMut", "nIsl", "r1est.nMut",
        "r1.est.nIsl"
    ]
    print("#%s" % "\t".join(header))

    rowFormat = "\t".join(
        ["%d", "%d", "%0.3f", "%d", "%d", "%d", "%d", "%0.9f", "%0.9f"])
    for trialIx in order:
        print(rowFormat % (kmerSequenceLength,
                           kmerSize,
                           pSubstitution,
                           1 + trialIx,
                           nErrorsObserved[trialIx],
                           nMutatedObserved[trialIx],
                           nIslandObserved[trialIx],
                           r1EstNMutatedObserved[trialIx],
                           r1EstNIslandObserved[trialIx]))

    # compute stats

    alpha = 1 - confidence
    q = p_mutated(kmerSize, pSubstitution)

    nMutatedMean = sample_mean(nMutatedObserved)
    nMutatedStDev = sqrt(sample_variance(nMutatedObserved))
    predNMutatedMean = exp_n_mutated(kmerSequenceLength, kmerSize,
                                     pSubstitution)
    predNMutatedStDev = sqrt(
        var_n_mutated(kmerSequenceLength, kmerSize, pSubstitution))
    rmseNMutatedStDev = abs(nMutatedStDev - predNMutatedStDev)

    nIslandMean = sample_mean(nIslandObserved)
    nIslandStDev = sqrt(sample_variance(nIslandObserved))
    predNIslandMean = exp_n_island(kmerSequenceLength, kmerSize, pSubstitution)
    predNIslandStDev = sqrt(
        var_n_island(kmerSequenceLength, kmerSize, pSubstitution))
    rmseNIslandStDev = abs(nIslandStDev - predNIslandStDev)

    rmseR1EstNMutated = sqrt(
        mean_squared_error(r1EstNMutatedObserved, pSubstitution))
    rmseR1EstNIsland = sqrt(
        mean_squared_error(r1EstNIslandObserved, pSubstitution))

    (predR1EstNMutatedLow, predR1EstNMutatedHigh) = \
        confidence_interval_r1_from_n_mutated(
            kmerSequenceLength, kmerSize, pSubstitution, alpha)
    # divided by numSequences below, so presumably a count of trials whose
    # observation fell inside the interval -- verify against the helper
    inConfR1EstNMutated = in_confidence_interval_q_from_n_mutated(
        kmerSequenceLength,
        kmerSize,
        pSubstitution,
        alpha,
        nMutatedObserved,
        useInverse=ciUseInverse)

    (predR1EstNIslandLow, predR1EstNIslandHigh) = \
        confidence_interval_r1_from_n_island(
            kmerSequenceLength, kmerSize, pSubstitution, alpha)
    inConfR1EstNIsland = in_confidence_interval_q_from_n_island(
        kmerSequenceLength,
        kmerSize,
        pSubstitution,
        alpha,
        nIslandObserved,
        nMutatedObserved,
        useInverse=ciUseInverse)

    # report stats

    statToText = {}
    statToText["r1"] = "%0.3f" % pSubstitution
    statToText["k"] = "%d" % kmerSize
    statToText["L"] = "%d" % kmerSequenceLength
    statToText["confidence"] = "%0.3f" % confidence
    statToText["trials"] = "%d" % numSequences
    statToText["q"] = "%0.9f" % q
    statToText["E[nMut].theory"] = "%0.9f" % predNMutatedMean
    statToText["StDev[nMut].theory"] = "%0.9f" % predNMutatedStDev
    statToText["CIlow(r1est.nMut).theory"] = "%0.9f" % predR1EstNMutatedLow
    statToText["CIhigh(r1est.nMut).theory"] = "%0.9f" % predR1EstNMutatedHigh
    statToText["inCI(r1est.nMut).obs"] = "%0.9f" % (
        float(inConfR1EstNMutated) / numSequences)
    statToText["Mean[nMut].obs"] = "%0.9f" % nMutatedMean
    statToText["StDev[nMut].obs"] = "%0.9f" % nMutatedStDev
    statToText["RMSE(StDev[nMut])"] = "%0.9f" % rmseNMutatedStDev
    statToText["RMSE(r1est.nMut)"] = "%0.9f" % rmseR1EstNMutated
    statToText["E[nIsl].theory"] = "%0.9f" % predNIslandMean
    statToText["StDev[nIsl].theory"] = "%0.9f" % predNIslandStDev
    statToText["CIlow(r1est.nIsl).theory"] = "%0.9f" % predR1EstNIslandLow
    statToText["CIhigh(r1est.nIsl).theory"] = "%0.9f" % predR1EstNIslandHigh
    statToText["inCI(r1est.nIsl).obs"] = "%0.9f" % (float(inConfR1EstNIsland) /
                                                    numSequences)
    statToText["Mean[nIsl].obs"] = "%0.9f" % nIslandMean
    statToText["StDev[nIsl].obs"] = "%0.9f" % nIslandStDev
    statToText["RMSE(StDev[nIsl])"] = "%0.9f" % rmseNIslandStDev
    statToText["RMSE(r1est.nIsl)"] = "%0.9f" % rmseR1EstNIsland
    statToText["r1est.nIsl.impossible"] = "%0.9f" % (
        float(numImpossibleNislands) / numSequences)

    if statsFilename is not None:
        if statsFilename.endswith((".gz", ".gzip")):
            statsF = gzip_open(statsFilename, "wt")
        else:
            statsF = open(statsFilename, "wt")

        # make sure the file is closed even if writing fails
        try:
            print("#%s" % "\t".join(statsOfInterest), file=statsF)
            statsLine = [statToText[stat] for stat in statsOfInterest]
            print("\t".join(statsLine), file=statsF)
        finally:
            statsF.close()
    else:
        statW = max(len(stat) for stat in statsOfInterest)
        for stat in statsOfInterest:
            # "%*s" consumes the width first and the string second; the
            # reversed order raised "TypeError: * wants int"
            print("%*s = %s" % (statW, stat, statToText[stat]), file=stderr)
def main():
    """Mutate one input sequence repeatedly and report r1 estimation stats.

    Command-line arguments are read from ``argv[1:]``.  Exactly one FASTA
    sequence is read from stdin; its distinct k-mer set (A) is compared
    with the k-mer set (B) of each mutated copy produced by the mutation
    model.  For every trial r1 is estimated both from the model's own
    mutated-k-mer count and from the set sizes ``|A|``, ``|B|`` and
    ``|A^B|``.

    Output:
        * optionally, each mutated sequence in FASTA form (``--mutated=``
          file, or stdout with ``--mutateonly`` and no file);
        * one tab-separated row per trial on stdout, preceded by a header;
        * unless ``--mutateonly`` was given, the summary statistics named
          in ``statsOfInterest``, written to the ``--stats=`` file or, when
          no file was given, to stderr.

    Side effects: rebinds the module globals ``reportProgress`` and
    ``debug``; seeds the PRNG when ``--seed=`` is given; calls ``usage()``
    for invalid arguments (presumably terminating the program -- verify
    against its definition); calls ``exit()`` after the per-trial report
    when ``--mutateonly`` is set.

    Raises:
        AssertionError: if stdin holds zero or more than one sequence, or
            the sequence is shorter than the k-mer size.
    """
    global reportProgress, debug

    # parse the command line

    kmerSize = 28
    numSequences = None
    noiseKind = None
    pSubstitution = None
    sequenceType = "linear"
    confidence = 0.99
    ciUseInverse = True
    sortBy = "nMutated"
    statsFilename = None
    mutatedFilename = None
    mutateOnly = False
    prngSeed = None
    reportProgress = None
    debug = []

    statsOfInterest = [
        "name", "r1", "k", "L", "confidence", "trials", "q", "Mean[|A|].obs",
        "Mean[|B|].obs", "Mean[|A^B|].obs", "Mean[|AuB|].obs",
        "Mean[nMut.A,B].obs", "Mean[L.A,B].obs", "Mean[r1est.A,B].obs",
        "inCI(r1est.A,B).obs"
    ]

    for arg in argv[1:]:
        # every option that consumes argVal is matched by a prefix ending
        # in "=", so argVal is always freshly bound when it is read
        if "=" in arg:
            argVal = arg.split("=", 1)[1]

        if arg in ["--help", "-help", "--h", "-h"]:
            usage()
        elif arg.startswith(("--kmer=", "K=")):
            kmerSize = int(argVal)
        elif arg.startswith(("--sequences=", "T=")):
            numSequences = int_with_unit(argVal)
        elif arg.startswith(("--poisson=", "--noise=", "P=")):
            noiseKind = "poisson"
            pSubstitution = parse_probability(argVal)
        elif arg.startswith(("--bernoulli=", "--error=", "B=", "E=")):
            noiseKind = "bernoulli"
            pSubstitution = parse_probability(argVal)
        elif arg == "--linear":
            sequenceType = "linear"
        elif arg == "--circular":
            sequenceType = "circular"
        elif arg.startswith(("--confidence=", "C=")):
            confidence = parse_probability(argVal)
        elif arg == "--noinverse":
            ciUseInverse = False
        elif arg == "--nosort":
            sortBy = None
        elif arg.startswith("--stats="):
            statsFilename = argVal
        elif arg.startswith("--mutated="):
            mutatedFilename = argVal
        elif arg == "--mutateonly":
            mutateOnly = True
        elif arg.startswith("--seed="):
            prngSeed = argVal
        elif arg.startswith("--progress="):
            reportProgress = int_with_unit(argVal)
        elif arg == "--debug":
            debug += ["debug"]
        elif arg.startswith("--debug="):
            debug += argVal.split(",")
        else:
            usage("unrecognized option: %s" % arg)

    if pSubstitution is None:
        usage("you have to tell me the mutation probability")

    if numSequences is None:
        numSequences = 1

    if noiseKind == "bernoulli":
        # the presence of non-ACGT nucleotides isn't considered
        usage("the bernoulli noise model is not currently supported")

    if sequenceType == "circular":
        # all the estimator code assumes linear sequences
        usage("circular sequences are not currently supported")

    # set up randomness

    if prngSeed is not None:
        random_seed(prngSeed.encode("utf-8"))

    # open a file to receive the mutated sequences

    mutatedF = None
    if mutateOnly and (mutatedFilename is None):
        mutatedF = stdout
    elif mutatedFilename is not None:
        if mutatedFilename.endswith((".gz", ".gzip")):
            mutatedF = gzip_open(mutatedFilename, "wt")
        else:
            mutatedF = open(mutatedFilename, "wt")

    # fetch the *single* input sequence

    numSequencesSeen = 0
    for (seqName, seq) in fasta_sequences(stdin):
        numSequencesSeen += 1
        assert (numSequencesSeen <
                2), "there was more than one sequence in the input"

    assert (numSequencesSeen == 1), "there were no sequences in the input"

    ntSequenceLength = len(seq)
    assert (
        ntSequenceLength >= kmerSize
    ), "input sequence length (%d) is shorter than the kmer size (%d)" % (
        ntSequenceLength, kmerSize)

    distinctKmersA = kmer_set(seq, kmerSize)
    numDistinctKmersA = len(distinctKmersA)

    # set up model/generator

    if (noiseKind == "poisson") and (sequenceType == "linear"):
        kmerSequenceLength = ntSequenceLength - (kmerSize - 1)
        mutationModel = PoissonModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_linear)
    elif (noiseKind == "bernoulli") and (sequenceType == "linear"):
        kmerSequenceLength = ntSequenceLength - (kmerSize - 1)
        mutationModel = BernoulliModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_linear)
    elif (noiseKind == "poisson") and (sequenceType == "circular"):
        kmerSequenceLength = ntSequenceLength
        mutationModel = PoissonModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_circular)
    elif (noiseKind == "bernoulli") and (sequenceType == "circular"):
        kmerSequenceLength = ntSequenceLength
        mutationModel = BernoulliModel \
                          (seq,kmerSize,pSubstitution,
                           count_mutated_kmers_circular)
    else:
        assert (False), "internal error"

    # generate sequences and collect stats

    alpha = 1 - confidence

    nErrorsObserved = []
    nMutatedObserved = []
    r1EstNMutatedObserved = []
    nDistinctAObserved = []
    nDistinctBObserved = []
    nDistinctIntersectionObserved = []
    nDistinctUnionObserved = []
    nMutatedABObserved = []
    kmerSequenceLengthABObserved = []
    r1EstABObserved = []
    inConfR1EstABObserved = []

    for seqNum in range(numSequences):
        if reportProgress is not None:
            if (1 + seqNum == 1) or ((1 + seqNum) % reportProgress == 0):
                print("testing sequence %d" % (1 + seqNum), file=stderr)

        # generate a mutated sequence and collect stats

        mutatedSeq = mutationModel.generate()
        if mutatedF is not None:
            # NOTE(review): the trailing ")" in the name suffix looks like a
            # typo; left unchanged because downstream consumers may rely on
            # the emitted names -- confirm and drop it if unintended
            write_fasta(mutatedF, seqName + "_mutation_%d)" % (1 + seqNum),
                        mutatedSeq)
        (nErrors, nMutated) = mutationModel.count()
        nErrorsObserved += [nErrors]
        nMutatedObserved += [nMutated]

        r1EstNMutated = estimate_r1_from_n_mutated(kmerSequenceLength,
                                                   kmerSize, nMutated)
        r1EstNMutatedObserved += [r1EstNMutated]

        distinctKmersB = kmer_set(mutatedSeq, kmerSize)
        numDistinctKmersB = len(distinctKmersB)
        nDistinctKmersIntersection = len(
            distinctKmersA.intersection(distinctKmersB))
        nDistinctKmersUnion = len(distinctKmersA.union(distinctKmersB))
        nDistinctAObserved += [numDistinctKmersA]
        nDistinctBObserved += [numDistinctKmersB]
        nDistinctIntersectionObserved += [nDistinctKmersIntersection]
        nDistinctUnionObserved += [nDistinctKmersUnion]

        # set-based estimate: L is taken as the mean of the two distinct
        # k-mer counts, nMut as that L minus the shared k-mers
        kmerSequenceLengthAB = (numDistinctKmersA + numDistinctKmersB) / 2.0
        nMutatedAB = kmerSequenceLengthAB - nDistinctKmersIntersection
        r1EstAB = estimate_r1_from_n_mutated(kmerSequenceLengthAB, kmerSize,
                                             nMutatedAB)
        nMutatedABObserved += [nMutatedAB]
        kmerSequenceLengthABObserved += [kmerSequenceLengthAB]
        r1EstABObserved += [r1EstAB]
        inConfR1EstAB = in_confidence_interval_q_from_n_mutated(
            kmerSequenceLengthAB,
            kmerSize,
            pSubstitution,
            alpha,
            nMutatedAB,
            useInverse=ciUseInverse)
        inConfR1EstABObserved += [inConfR1EstAB]

    # report per-trial results

    if sortBy == "nMutated":
        # descending by |A^B|; the (value, index) pairs are unique, so a
        # reversed sort equals sorting ascending and then reversing
        order = [
            ix for (_, ix) in sorted(
                ((nDistinctIntersectionObserved[ix], ix)
                 for ix in range(numSequences)),
                reverse=True)
        ]
    else:  # if (sortBy == None):
        order = list(range(numSequences))

    header = [
        "L", "K", "r", "trial", "nErr", "nMut", "r1est.nMut", "|A|", "|B|",
        "|A^B|", "|AuB|", "nMut.A,B", "L.A,B", "r1est.A,B", "inCI(r1est.A,B)"
    ]
    print("#%s" % "\t".join(header))

    rowFormat = "\t".join([
        "%d", "%d", "%0.3f", "%d", "%d", "%d", "%0.9f", "%d", "%d", "%d",
        "%d", "%0.1f", "%0.1f", "%0.9f", "%d"
    ])
    for trialIx in order:
        print(rowFormat % (
            kmerSequenceLength,                       # L
            kmerSize,                                 # K
            pSubstitution,                            # r
            1 + trialIx,                              # trial
            nErrorsObserved[trialIx],                 # nErr
            nMutatedObserved[trialIx],                # nMut
            r1EstNMutatedObserved[trialIx],           # r1est.nMut
            nDistinctAObserved[trialIx],              # |A|
            nDistinctBObserved[trialIx],              # |B|
            nDistinctIntersectionObserved[trialIx],   # |A^B|
            nDistinctUnionObserved[trialIx],          # |AuB|
            nMutatedABObserved[trialIx],              # nMut.A,B
            kmerSequenceLengthABObserved[trialIx],    # L.A,B
            r1EstABObserved[trialIx],                 # r1est.A,B
            inConfR1EstABObserved[trialIx]))          # inCI(r1est.A,B)

    if (mutatedF is not None) and (mutatedF is not stdout):
        mutatedF.close()

    if mutateOnly:
        exit()

    # compute stats

    q = p_mutated(kmerSize, pSubstitution)

    nMutatedMean = sample_mean(nMutatedObserved)
    nMutatedStDev = sqrt(sample_variance(nMutatedObserved))
    predNMutatedMean = exp_n_mutated(kmerSequenceLength, kmerSize,
                                     pSubstitution)
    predNMutatedStDev = sqrt(
        var_n_mutated(kmerSequenceLength, kmerSize, pSubstitution))
    rmseNMutatedStDev = abs(nMutatedStDev - predNMutatedStDev)
    rmseR1EstNMutated = sqrt(
        mean_squared_error(r1EstNMutatedObserved, pSubstitution))

    (predR1EstNMutatedLow, predR1EstNMutatedHigh) = \
        confidence_interval_r1_from_n_mutated(
            kmerSequenceLength, kmerSize, pSubstitution, alpha)
    inConfR1EstNMutated = in_confidence_interval_q_from_n_mutated(
        kmerSequenceLength,
        kmerSize,
        pSubstitution,
        alpha,
        nMutatedObserved,
        useInverse=ciUseInverse)

    nDistinctAMean = sample_mean(nDistinctAObserved)
    nDistinctBMean = sample_mean(nDistinctBObserved)
    nDistinctIntersectionMean = sample_mean(nDistinctIntersectionObserved)
    nDistinctUnionMean = sample_mean(nDistinctUnionObserved)
    nMutatedABMean = sample_mean(nMutatedABObserved)
    kmerSequenceLengthABMean = sample_mean(kmerSequenceLengthABObserved)
    r1EstABMean = sample_mean(r1EstABObserved)

    # report stats

    statToText = {}
    statToText["name"] = seqName
    statToText["r1"] = "%0.3f" % pSubstitution
    statToText["k"] = "%d" % kmerSize
    statToText["L"] = "%d" % kmerSequenceLength
    statToText["confidence"] = "%0.3f" % confidence
    statToText["trials"] = "%d" % numSequences
    statToText["q"] = "%0.9f" % q
    statToText["E[nMut].theory"] = "%0.9f" % predNMutatedMean
    statToText["StDev[nMut].theory"] = "%0.9f" % predNMutatedStDev
    statToText["CIlow(r1est.nMut).theory"] = "%0.9f" % predR1EstNMutatedLow
    statToText["CIhigh(r1est.nMut).theory"] = "%0.9f" % predR1EstNMutatedHigh
    statToText["inCI(r1est.nMut).obs"] = "%0.9f" % (
        float(inConfR1EstNMutated) / numSequences)
    statToText["Mean[nMut].obs"] = "%0.9f" % nMutatedMean
    statToText["StDev[nMut].obs"] = "%0.9f" % nMutatedStDev
    statToText["RMSE(StDev[nMut])"] = "%0.9f" % rmseNMutatedStDev
    statToText["RMSE(r1est.nMut)"] = "%0.9f" % rmseR1EstNMutated
    statToText["Mean[|A|].obs"] = "%d" % nDistinctAMean
    statToText["Mean[|B|].obs"] = "%d" % nDistinctBMean
    statToText["Mean[|A^B|].obs"] = "%d" % nDistinctIntersectionMean
    statToText["Mean[|AuB|].obs"] = "%d" % nDistinctUnionMean
    statToText["Mean[nMut.A,B].obs"] = "%d" % nMutatedABMean
    statToText["Mean[L.A,B].obs"] = "%d" % kmerSequenceLengthABMean
    statToText["Mean[r1est.A,B].obs"] = "%0.9f" % r1EstABMean
    statToText["inCI(r1est.A,B).obs"] = "%0.9f" % (
        float(sum(inConfR1EstABObserved)) / numSequences)

    if statsFilename is not None:
        if statsFilename.endswith((".gz", ".gzip")):
            statsF = gzip_open(statsFilename, "wt")
        else:
            statsF = open(statsFilename, "wt")

        # make sure the file is closed even if writing fails
        try:
            print("#%s" % "\t".join(statsOfInterest), file=statsF)
            statsLine = [statToText[stat] for stat in statsOfInterest]
            print("\t".join(statsLine), file=statsF)
        finally:
            statsF.close()
    else:
        statW = max(len(stat) for stat in statsOfInterest)
        for stat in statsOfInterest:
            # "%*s" consumes the width first and the string second; the
            # reversed order raised "TypeError: * wants int"
            print("%*s = %s" % (statW, stat, statToText[stat]), file=stderr)
Beispiel #5
0
def fourth_moment_using_normal(L, k, p):
    """Return ``mu**4 + 6*mu**2*var + 3*var**2``.

    ``mu`` is ``exp_n_mutated(L, k, p)`` and ``var`` is
    ``var_n_mutated(L, k, p)``; the expression is the fourth raw moment of
    a normal distribution with that mean and variance.
    """
    mean = exp_n_mutated(L, k, p)
    variance = var_n_mutated(L, k, p)

    mean_term = mean**4
    mixed_term = 6 * mean**2 * variance
    variance_term = 3 * variance**2
    return mean_term + mixed_term + variance_term
Beispiel #6
0
def third_moment_nmut_using_normal(L, k, p):
    """Return ``mu**3 + 3*mu*var``.

    ``mu`` is ``exp_n_mutated(L, k, p)`` and ``var`` is
    ``var_n_mutated(L, k, p)``; the expression is the third raw moment of
    a normal distribution with that mean and variance.
    """
    mean = exp_n_mutated(L, k, p)
    variance = var_n_mutated(L, k, p)

    mean_term = mean**3
    mixed_term = 3 * mean * variance
    return mean_term + mixed_term
Beispiel #7
0
def exp_n_mutated_squared(L, k, p):
    """Return ``var_n_mutated(L, k, p) + exp_n_mutated(L, k, p)**2``.

    This is the identity E[X^2] = Var[X] + E[X]^2 applied to the two
    helpers.
    """
    variance = var_n_mutated(L, k, p)
    mean = exp_n_mutated(L, k, p)
    return variance + mean**2
Beispiel #8
0
def var_test(L, k, p, s, confidence):
    """Print a ``test`` marker, then ``thm5.var_n_mutated(L, k, p) / L**2``.

    ``s`` and ``confidence`` are accepted but not read by the statements
    below.
    """
    print('test')
    # value returned by thm5.var_n_mutated, scaled down by L squared
    print(thm5.var_n_mutated(L, k, p) / L**2)