Example #1
import numpy as np
from numpy import log
from scipy.stats import norm


def _negative_log_likelihood(params, log_T, E):
    r"""
    The log-likelihood is:

    .. math:: \sum_{\text{death events}} \log{h(t_i)} + \sum_{\text{all events}} \log{S(t_i)}

    where h is the hazard function and S is the survival function.

    1. Why is this scaled by n? When T is large, the gradient becomes more and more
    sensitive, and smaller and smaller steps are needed to bring the gradient below gtol
    (gtol is the minimum absolute value allowed in the gradient). Unfortunately, for very
    "spiky" log-likelihoods, no sufficiently small step size exists and the gradient
    never gets near 0.

    Another way to think of this: ll ~= E[ll_i] * N, so gradient = diff(E[ll]) * N, so we need to scale ll.
    """
    n = log_T.shape[0]
    mu, sigma = params
    if sigma < 0.0001:
        return 1e16

    Z = (log_T - mu) / sigma
    log_sf = norm.logsf(Z)

    ll = (E *
          (norm.logpdf(Z) - log_T - log(sigma) - log_sf)).sum() + log_sf.sum()
    return -ll / n
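A minimal usage sketch, not part of the original snippet: with the imports above and synthetic, fully observed data (the generating values here are made up), the parameters can be recovered with scipy.optimize.minimize.

import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)
T = rng.lognormal(mean=1.0, sigma=0.5, size=500)  # synthetic survival times
E = np.ones_like(T)  # event indicator: all deaths observed, no censoring

res = minimize(_negative_log_likelihood, x0=np.array([0.0, 1.0]),
               args=(np.log(T), E), method='Nelder-Mead')
mu_hat, sigma_hat = res.x  # should land near (1.0, 0.5)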
Example #2
import numpy as np
from numpy import empty, logical_not, logical_or, multiply
from scipy.stats import gamma, norm


def q2qnbinom(counts, input_mean, output_mean, dispersion):
    """Quantile-to-quantile mapping for a negative binomial: map counts observed
    at input_mean to the equivalent quantiles at output_mean, averaging a normal
    and a gamma approximation.
    """
    zero = logical_or(input_mean < 1e-14, output_mean < 1e-14)
    input_mean[zero] = input_mean[zero] + 0.25
    output_mean[zero] = output_mean[zero] + 0.25
    ri = 1 + multiply(np.matrix(dispersion).T, input_mean)
    vi = multiply(input_mean, ri)
    rO = 1 + multiply(np.matrix(dispersion).T, output_mean)
    vO = multiply(output_mean, rO)
    i = counts >= input_mean
    low = logical_not(i)
    p1 = empty(counts.shape, dtype=np.float64)
    p2 = p1.copy()
    q1, q2 = p1.copy(), p1.copy()
    if i.any():
        p1[i] = norm.logsf(counts[i], loc=input_mean[i], scale=np.sqrt(vi[i]))[0, :]
        p2[i] = gamma.logsf(counts[i], (input_mean / ri)[i], scale=ri[i])[0, :]
        q1[i] = norm.ppf(1 - np.exp(p1[i]), output_mean[i], np.sqrt(vO[i]))[0, :]
        q2[i] = gamma.ppf(1 - np.exp(p2[i]), np.divide(output_mean[i], rO[i]), scale=rO[i])[0, :]

    if low.any():
        p1[low] = norm.logcdf(counts[low], loc=input_mean[low], scale=np.sqrt(vi[low]))[0, :]
        p2[low] = gamma.logcdf(counts[low], input_mean[low] / ri[low], scale=ri[low])[0, :]
        q1[low] = norm.ppf(np.exp(p1[low]), loc=output_mean[low], scale=np.sqrt(vO[low]))[0, :]
        q2[low] = gamma.ppf(np.exp(p2[low]), output_mean[low] / rO[low], scale=rO[low])[0, :]
    return (q1 + q2) / 2
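The core idea, reduced to one observation and the normal branch only (a sketch with made-up values, not part of the original): take the tail probability of the count under the input-mean distribution, then invert that probability under the output-mean distribution.

import numpy as np
from scipy.stats import norm

count, in_mean, out_mean, disp = 50.0, 40.0, 80.0, 0.1
v_in = in_mean * (1 + disp * in_mean)     # NB variance at the input mean
v_out = out_mean * (1 + disp * out_mean)  # NB variance at the output mean
logp = norm.logsf(count, loc=in_mean, scale=np.sqrt(v_in))
mapped = norm.ppf(1 - np.exp(logp), loc=out_mean, scale=np.sqrt(v_out))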
Example #3
 def __next__(self):
     if len(self.dummy) < 1:
         raise StopIteration
     probas = norm.logsf(self.a,
                         loc=self.mu,
                         scale=np.sqrt(np.diag(self.Sigma)))
     chosen_index = self.orderer(probas)
     condition_value = self.__class__.truncnorm_mean(
         mean=self.mu[chosen_index],
         var=self.Sigma[chosen_index, chosen_index],
         a=self.a[chosen_index])
     # reorder each 1-D attribute in place
     for x in [self.mu, self.a, self.dummy]:
         self.__class__.reorder_1d(x, chosen_index)
     self.__class__.reorder_2d(self.Sigma, chosen_index)
     self.mu, self.Sigma = self.__class__.condition_on_first_variable(
         mu=self.mu, Sigma=self.Sigma, val=condition_value)
     if self.debug:
         print(self.mu)
         print(self.Sigma)
     self.a = self.a[1:]
     res = self.dummy[0]
     self.dummy = self.dummy[1:]
     return res
Example #4
 def loglik(mu):
     # uncensored observations contribute the normal log-density
     ll = -sum(norm.logpdf(values[not_censored], loc=mu, scale=std_est))
     if n_left_cens > 0:
         # left-censored observations contribute the log-CDF (mass below the bound)
         ll -= sum(
             norm.logcdf(values[left_censored], loc=mu, scale=std_est))
     if n_right_cens > 0:
         # right-censored observations contribute the log survival function (mass above)
         ll -= sum(
             norm.logsf(values[right_censored], loc=mu, scale=std_est))
     return ll
Example #5
import numpy as np
from scipy.stats import norm


def discretized_normal_log(yvals, mean, sd):
    s = yvals[1] - yvals[0]  # distance between points in yvals
    # log CDF at the midpoint between each grid point and the previous one
    bot_dist = norm.logcdf(x=yvals - s / 2.0, loc=mean, scale=sd)
    # log CDF at the midpoint between each grid point and the next one
    top_dist = norm.logcdf(x=yvals + s / 2.0, loc=mean, scale=sd)
    # log(exp(top) - exp(bot)) via the expm1 difference formula
    diff = top_dist + np.log(-np.expm1(bot_dist - top_dist))
    diff[0] = top_dist[0]  # first value should integrate from -inf
    # last value should integrate to +inf
    diff[-1] = norm.logsf(x=yvals[-1] - s / 2.0, loc=mean, scale=sd)
    return diff
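A quick sanity check, not in the original: since the bins partition the real line, the exponentiated log-probabilities should sum to 1.

import numpy as np

ygrid = np.linspace(-5.0, 5.0, 101)
logp = discretized_normal_log(ygrid, mean=0.0, sd=1.0)
print(np.exp(logp).sum())  # ~1.0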
Example #6
 def ps_not_dark_hot(self, cube_in, write_fits=False):
     name = 'ps_not_dark_hot'
     ps = np.ones_like(cube_in.counts.data, dtype=bool)
     cts, exp = cube_in.cts_exp_shadowgram(*self.erange_dark)
     lp_dd, lp_hd = shadowgram.logprob_not_dark_hot(cts, exp, self.ref_dark)
     cts, exp = cube_in.cts_exp_shadowgram(*self.erange_hot)
     lp_dh, lp_hh = shadowgram.logprob_not_dark_hot(cts, exp, self.ref_hot)
     dark, hot = \
         [np.logical_and(lp < norm.logsf(self.sigma_max),
                         np.logical_not(cts.mask))
          for lp in (lp_dd, lp_hh)]
     ps[:, np.logical_or(dark, hot)] = False
     if write_fits:
         pixels = np.zeros(cts.shape, dtype=np.ubyte)
         pixels[cts.mask] += 1
         pixels[dark] += 2
         pixels[hot] += 4
         self.write_outlier_map(name, cube_in.scwid, pixels)
     return ps
Example #7
def log_one_minus_fdp_gaussian(params, logfpr):
    """
    :param params:
        'sigma' --- is the normalized noise level: std divided by global L2 sensitivity
    :param logfpr: log of False positive rate --- input to the fDP function
    :return: log(1-f(x)).
    """
    sigma = params['sigma']
    assert sigma >= 0
    if sigma == 0:
        return 0
    elif np.isneginf(logfpr):
        return -np.inf
    else:
        norm_ppf_one_minus_fpr = utils.stable_norm_ppf_one_minus_x(logfpr)
        return norm.logsf(norm_ppf_one_minus_fpr - 1 / sigma)
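The identity behind this function, written out with plain scipy calls (a sketch that skips the numerically stabilized helper, so it loses precision for very small FPRs): for the Gaussian mechanism, the trade-off curve is f(alpha) = Phi(Phi^{-1}(1 - alpha) - 1/sigma).

import numpy as np
from scipy.stats import norm

sigma, fpr = 1.0, 0.05  # illustrative values
# log(1 - f(fpr)) for the Gaussian trade-off curve
log_one_minus_f = norm.logsf(norm.ppf(1 - fpr) - 1 / sigma)
print(log_one_minus_f)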
Example #8
 def prob_inside_dg_each(self,
                         vs,
                         g_idx,
                         logscale=False,
                         dg_mean=None,
                         dg_var=None,
                         offset=0.0):
     '''
     Args:
         vs: 2D array (num_vs, ndim)
         g_idx: integer index of an axis
         logscale: return the log of the probability if True
     Returns:
         pd: 1D array (num_vs), P('v_g' is inside) for each 'v' in 'vs'
     '''
     vs = np.atleast_2d(vs)
     if dg_mean is None or dg_var is None:
         dg_mean, dg_var = self.posterior_dg(vs, g_idx, full_cov=False)
     if logscale:
         return norm.logsf(offset, dg_mean, np.sqrt(dg_var))
     else:
         # sf: survival function, 1 - cdf, numerically more stable
         return norm.sf(offset, dg_mean, np.sqrt(dg_var))
Example #9
def pnorm(q, mean=0, sd=1, lowertail=True, log=False):
    """
    ============================================================================
                                                                        pnorm()
    ============================================================================
    The cumulative distribution function for the normal distribution.
    You provide a value along the normal distribution (eg x=3), or an array of
    values, and it returns the proportion of the distribution that lies below it.

    Alternatively, if you set lowertail=False, it returns the proportion that
    lies above it.

    USAGE:
    cnorm(mean=0, sd=1, type="equal", conf=0.95)
    dnorm(x, mean=0, sd=1, log=False)
    pnorm(q, mean=0, sd=1, lowertail=True, log=False)
    qnorm(p, mean=0, sd=1, lowertail=True, log=False)
    rnorm(n=1, mean=0, sd=1)

    :param q (float, array of floats): The values along the distribution.
    :param mean (float):     mean of the distribution
    :param sd (float):       standard deviation
    :param lowertail (bool): are you interested in the proportion of the
                             distribution beneath q (True), or above it (False)?
    :param log (bool):       return the log of the probability?
    :return:        an array of probabilities corresponding to the values in q
    ============================================================================
    """
    if lowertail and not log:
        return norm.cdf(x=q, loc=mean, scale=sd)
    elif not lowertail and not log:
        return norm.sf(x=q, loc=mean, scale=sd)
    elif lowertail and log:
        return norm.logcdf(x=q, loc=mean, scale=sd)
    else:
        return norm.logsf(x=q, loc=mean, scale=sd)
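Example calls, mirroring R's pnorm (the printed values are approximate):

print(pnorm(1.96))                             # ~0.975, lower tail
print(pnorm(1.96, lowertail=False))            # ~0.025, upper tail
print(pnorm(1.96, lowertail=False, log=True))  # log of the upper tail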
Example #10
def pnorm(x, mean=0, sd=1, lowertail=True, log=False):
    """
    ============================================================================
                                                                        pnorm()
    ============================================================================
    The cumulative distribution function for the normal distribution.
    You provide a value along the normal distribution (eg x=3), or an array of
    values, and it returns the proportion of the distribution that lies below it.

    Alternatively, if you set lowertail=False, it returns the proportion that
    lies above it.

    USAGE:
    cnorm(mean=0, sd=1, type="equal", conf=0.95)
    dnorm(x, mean=0, sd=1, log=False)
    pnorm(q, mean=0, sd=1, lowertail=True, log=False)
    qnorm(p, mean=0, sd=1, lowertail=True, log=False)
    rnorm(n=1, mean=0, sd=1)

    :param x (float, array of floats): The values along the distribution.
    :param mean (float):     mean of the distribution
    :param sd (float):       standard deviation
    :param lowertail (bool): are you interested in the proportion of the
                             distribution beneath x (True), or above it (False)?
    :param log (bool):       return the log of the probability?
    :return:        an array of probabilities corresponding to the values in x
    ============================================================================
    """
    if lowertail and not log:
        return norm.cdf(x, loc=mean, scale=sd)
    elif not lowertail and not log:
        return norm.sf(x, loc=mean, scale=sd)
    elif lowertail and log:
        return norm.logcdf(x, loc=mean, scale=sd)
    else:
        return norm.logsf(x, loc=mean, scale=sd)
Example #11
import math

import numpy as np
from scipy.stats import norm


def Zscore_to_Pvalue(zscore):
    """Convert z-scores to -log10 one-sided p-values."""
    abs_zscore = np.absolute(zscore)
    pvalue = -1 * (norm.logsf(abs_zscore) / math.log(10))
    return pvalue
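For instance (not in the original): a z-score of 5 maps to roughly 6.5, i.e. a one-sided p-value of about 2.9e-7.

print(Zscore_to_Pvalue(5.0))  # ~6.54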
Example #12
import tqdm
import itertools
from scipy.stats import norm

ACCURACY = 2**-300

#######################
#  error computation  #
#######################

delta = lambda n, q, sd: 1 - erf(
    round(q / 4) / (sqrt(2) * sd**2 * sqrt(2 * n)))
err = lambda n, q, sd, m: (1 - (1 - delta(n, q, sd))**m).n().log(2)

delta2 = lambda n, q, sd, B: norm.logsf(float(round(q / 2 / 2**B)),
                                        scale=float(sd**2 * sqrt(2 * n))
                                        ) / np.log(2) + 1
err2 = lambda n, q, sd, B, m: delta2(n, q, sd, B) + np.log2(m)

#########################
# security estimation   #
#########################

try:
    load('https://bitbucket.org/malb/lwe-estimator/raw/HEAD/estimator.py')

    def est(n, q, sd):
        alpha = sqrt(2 * pi) * sd / RR(q)
        m = n
        secret_distribution = "normal"
        success_probability = 0.99
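A quick way to evaluate the delta2/err2 bound above numerically (a sketch; the parameters below are placeholders, not from any real scheme):

import numpy as np
from math import sqrt
from scipy.stats import norm

n, q, sd, B, m = 1024, 2**32, 3.2, 2, 100  # placeholder parameters
d2 = norm.logsf(float(round(q / 2 / 2**B)),
                scale=float(sd**2 * sqrt(2 * n))) / np.log(2) + 1
print(d2 + np.log2(m))  # log2 of the failure probability over m samples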
Example #13
 def log_sf(self, s):
     return norm.logsf(s, loc=self.mu, scale=self.std)
Example #14
from math import sqrt

from scipy.stats import norm

# Normal approximation to a binomial(n, p) upper tail at x.
n = 468900
p = 0.00018
x = 244

the_scale = sqrt(n * p * (1 - p))
the_loc = n * p
# survival function = 1 - cdf
output = -norm.logsf(x, loc=the_loc, scale=the_scale)
print(output)
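The result is in nats; dividing by ln(10) converts it to the -log10 p-value convention (a small follow-up, not in the original):

from math import log

print(output / log(10))  # same tail probability, as -log10(p)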
Example #15
 def logG(self, t, x):
     # log P(Z > a_t - x @ Gamma_e) under a standard normal, for each row of x
     Gamma_e = self.Gamma[t][0:t].reshape((t, 1))
     x = x[:, 0:t]
     truncations = (-x @ Gamma_e + self.a[t]).reshape((len(x), ))
     return norm.logsf(truncations)
Example #16
    def test_pairs(self):
        mean_ins = 142
        std_ins = 81.5431220178
        max_ins = 386
        in_bam = tk_bam.create_bam_infile(PAIR_FILE)
        for idx, read in enumerate(in_bam):
            if idx % 2 == 0:
                read1 = read
            else:
                read2 = read
                rp = tk_readpairs.ReadPair(read1, read2)
                if idx == 1:  # HISEQ-002:203:HBERLADXX:2:1109:13682:22490
                    self.assertEqual(tk_readpairs.map_qstart(read1), 0)
                    self.assertEqual(tk_readpairs.map_qstart(read2), 0)
                    self.assertEqual(rp.sv_type, tk_readpairs.DEL_STR)
                    range1, range2 = rp.get_break_ranges(max_ins)
                    # Maximum distance between reads is max_ins - 2 * 88
                    self.assertEqual(
                        range1, (189704422 + 88, 189704422 + max_ins - 88 + 1))
                    self.assertEqual(
                        range2, (189783402 - max_ins + 2 * 88, 189783402 + 1))
                if idx == 13:  # HISEQ-002:203:HBERLADXX:2:2201:3706:93604
                    self.assertEqual(tk_readpairs.map_qstart(read1), 0)
                    self.assertEqual(tk_readpairs.map_qstart(read2), 0)
                    self.assertEqual(rp.sv_type, tk_readpairs.INV_STR)
                    range1, range2 = rp.get_break_ranges(max_ins)
                    self.assertEqual(
                        range1, (162503343 - max_ins + 2 * 88, 162503343 + 1))
                    self.assertEqual(
                        range2, (162627709 - max_ins + 2 * 88, 162627709 + 1))
                    self.assertEqual(
                        tk_readpairs.get_pair_break_dist(read1, 162503243),
                        188)
                    self.assertEqual(
                        tk_readpairs.get_pair_break_dist(read2, 162627609),
                        154)
                if idx == 19:  # HISEQ-002:203:HBERLADXX:1:2214:1366:24373
                    self.assertEqual(tk_readpairs.map_qstart(read1), 0)
                    self.assertEqual(tk_readpairs.map_qstart(read2), 0)
                    self.assertEqual(rp.sv_type, tk_readpairs.DEL_STR)
                    range1, range2 = rp.get_break_ranges(max_ins)
                    # +8 because this read has 8bp of deletions with respect to the reference
                    # so the last aligned position on the reference is start + 88 + 8
                    self.assertEqual(
                        range1,
                        (78967094 + 88 + 8, 78967094 + max_ins - 88 + 1 + 8))
                    self.assertEqual(
                        range2, (79036484 - max_ins + 2 * 88, 79036484 + 1))
                    self.assertEqual(
                        tk_readpairs.get_pair_break_dist(read1, 78967294), 200)
                    self.assertEqual(
                        tk_readpairs.get_pair_break_dist(read2, 79036284), 288)

                    # The ranges are single points. range2[1] is outside the valid range.
                    probs = rp.get_break_lr(
                        np.arange(range1[0], range1[1]),
                        np.ones((range1[1] - range1[0], )) * range2[1], 0,
                        lambda x: 0.0)
                    assert (np.all(probs == 0))
                    # probabilities returned should be decreasing because we "used up" all the
                    # insert size on the side of read2
                    probs = rp.get_break_lr(
                        np.arange(range1[0], range1[1]),
                        np.ones((range1[1] - range1[0], )) * range2[1],
                        max_ins, lambda x: norm.logsf(x, mean_ins, std_ins))
                    self.assertEqual(list(probs), list(sorted(probs)[::-1]))
                    # probabilities returned should be increasing because the mean insert size is
                    # pretty large.
                    probs = rp.get_break_lr(
                        np.arange(range1[0], range1[1]),
                        np.ones((range1[1] - range1[0], )) * range2[0], 10000,
                        lambda x: norm.logsf(x, 10000, std_ins))
                    self.assertEqual(list(probs), list(sorted(probs)))
        in_bam.close()
Example #17
 def _logG(self, t: int, a_t: float, x: np.ndarray, assert_mode: int = 1):
     assert x.shape[1] == t + assert_mode
     Gamma_e = self.Gamma[t][0:t].reshape((t, 1))
     x = x[:, 0:t]
     truncations = (-x @ Gamma_e + a_t).reshape((len(x), ))
     return norm.logsf(truncations)
Example #18
while True:
    i += 1
    if i % 1000 == 0:
        print(i)
    rmse = K.proc_next_packet()
    if rmse == -1:
        break
    RMSEs.append(rmse)
stop = time.time()
print("Complete. Time elapsed: " + str(stop - start))

# Here we demonstrate how one can fit the RMSE scores to a log-normal distribution (useful for finding/setting a cutoff threshold \phi)
from scipy.stats import norm
benignSample = np.log(RMSEs[FMgrace + ADgrace + 1:100000])
logProbs = norm.logsf(
    np.log(RMSEs), np.mean(benignSample), np.std(benignSample)
)  # log survival function (1 - CDF) of each log-RMSE under the fitted normal

# plot the RMSE anomaly scores
print("Plotting results")
from matplotlib import pyplot as plt
from matplotlib import cm
plt.figure(figsize=(10, 5))
fig = plt.scatter(range(FMgrace + ADgrace + 1, len(RMSEs)),
                  RMSEs[FMgrace + ADgrace + 1:],
                  s=0.1,
                  c=logProbs[FMgrace + ADgrace + 1:],
                  cmap='RdYlGn')
plt.yscale("log")  #Change the scale of the axis to log
plt.title("Anomaly Scores from Kitsune's Execution Phase")
plt.ylabel("RMSE (log scaled)")
Example #19
 def sigma_loglikelihood(sigma):
     # wp contributes the normal log-density; bp contributes the log survival
     # function, i.e. each bp value is treated as a lower bound (right-censored)
     wp_loglikelihood = norm.logpdf(wp, wp_pred, sigma)
     bp_loglikelihood = norm.logsf(bp, bp_pred, sigma)
     return -(np.sum(wp_loglikelihood) + np.sum(bp_loglikelihood))
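A sketch of how such a nested objective is typically driven (assuming wp, wp_pred, bp and bp_pred exist in the enclosing scope; the bounds are made up):

from scipy.optimize import minimize_scalar

res = minimize_scalar(sigma_loglikelihood, bounds=(1e-6, 10.0), method='bounded')
sigma_hat = res.x  # maximum-likelihood estimate of sigma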
Example #20
        print(i + 1)
    rm, lt = C.process(X[i, ])
    rms.append(rm)
    lts.append(lt)
stop = time.time()
print("Complete. Time elapsed: " + str(stop - start))

prms = np.array(rms[200000:])
plts = np.array(lts[200000:])
scores = np.zeros(400000)

scores[:100000] = 2 * np.exp(10 * prms[:100000])
scores[100000:] = np.exp(10 * prms[100000:]) + np.exp(10 * plts[100000:])
index = np.array(range(len(scores)))
benignSample = np.log(scores[:50000])
logProbs = norm.logsf(np.log(scores), np.mean(benignSample),
                      np.std(benignSample))

fig3 = plt.figure(figsize=(12.8, 6.4))
plt.scatter(index, scores, s=4, c=logProbs, cmap='RdYlGn')
plt.ylim([min(scores), max(scores) + 1.5])
plt.annotate('Normal Traffic',
             xy=(index[26000], 3),
             xytext=(index[0], max(scores)),
             arrowprops=dict(facecolor='black', shrink=0.005),
             fontsize='large')
plt.annotate('DDoS Attack Traffic',
             xy=(index[100000], max(scores)),
             xytext=(index[0], max(scores) + 1),
             arrowprops=dict(facecolor='black', shrink=0.005),
             fontsize='large')