from numpy import log
from scipy.stats import norm


def _negative_log_likelihood(params, log_T, E):
    r"""
    The log-likelihood is:

    .. math:: \sum_{\text{death events}} \log h(t_i) + \sum_{\text{all events}} \log S(t_i)

    where h is the hazard function and S is the survival function.

    1. Why is this scaled by n? When T is large, the gradient becomes more and
       more sensitive, and smaller and smaller steps are needed to push the
       gradient below gtol (gtol is the absolute minimum value in the gradient).
       Unfortunately, for very "spiky" log-likelihoods a small enough step size
       may not exist, so the gradient never gets near 0. Another way to think of
       this: ll ~= E[ll_i] * N, so gradient = diff(E[ll]) * N, so we need to
       scale ll.
    """
    n = log_T.shape[0]
    mu, sigma = params
    if sigma < 0.0001:
        return 1e16
    Z = (log_T - mu) / sigma
    log_sf = norm.logsf(Z)
    ll = (E * (norm.logpdf(Z) - log_T - log(sigma) - log_sf)).sum() + log_sf.sum()
    return -ll / n
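
# A minimal usage sketch (not from the original source; the synthetic data,
# starting values, and optimizer choice are assumptions): fit (mu, sigma) of a
# log-normal survival model to right-censored data by minimizing the scaled
# negative log-likelihood above.
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)
T = rng.lognormal(mean=2.0, sigma=0.5, size=500)   # latent event times
C = rng.lognormal(mean=2.5, sigma=0.5, size=500)   # censoring times
observed = np.minimum(T, C)
E = (T <= C).astype(float)                         # 1 = event observed, 0 = censored

result = minimize(_negative_log_likelihood,
                  x0=np.array([0.0, 1.0]),         # initial (mu, sigma)
                  args=(np.log(observed), E),
                  method="Nelder-Mead")
print(result.x)  # estimated (mu, sigma) on the log-time scale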
import numpy as np
from numpy import empty, logical_not, logical_or, multiply
from scipy.stats import gamma, norm


def q2qnbinom(counts, input_mean, output_mean, dispersion):
    """Quantile-to-quantile mapping for a negative binomial."""
    zero = logical_or(input_mean < 1e-14, output_mean < 1e-14)
    input_mean[zero] = input_mean[zero] + 0.25
    output_mean[zero] = output_mean[zero] + 0.25

    ri = 1 + multiply(np.matrix(dispersion).T, input_mean)
    vi = multiply(input_mean, ri)
    rO = 1 + multiply(np.matrix(dispersion).T, output_mean)
    vO = multiply(output_mean, rO)

    i = counts >= input_mean
    low = logical_not(i)

    p1 = empty(counts.shape, dtype=np.float64)
    p2 = p1.copy()
    q1, q2 = p1.copy(), p1.copy()

    if i.any():
        p1[i] = norm.logsf(counts[i], loc=input_mean[i], scale=np.sqrt(vi[i]))[0, :]
        p2[i] = gamma.logsf(counts[i], (input_mean / ri)[i], scale=ri[i])[0, :]
        q1[i] = norm.ppf(1 - np.exp(p1[i]), output_mean[i], np.sqrt(vO[i]))[0, :]
        q2[i] = gamma.ppf(1 - np.exp(p2[i]), np.divide(output_mean[i], rO[i]), scale=rO[i])[0, :]

    if low.any():
        p1[low] = norm.logcdf(counts[low], loc=input_mean[low], scale=np.sqrt(vi[low]))[0, :]
        p2[low] = gamma.logcdf(counts[low], input_mean[low] / ri[low], scale=ri[low])[0, :]
        q1[low] = norm.ppf(np.exp(p1[low]), loc=output_mean[low], scale=np.sqrt(vO[low]))[0, :]
        q2[low] = gamma.ppf(np.exp(p2[low]), output_mean[low] / rO[low], scale=rO[low])[0, :]

    return (q1 + q2) / 2
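
# A small usage sketch (synthetic values; the row-matrix shapes are assumptions
# based on the indexing above): q2qnbinom maps counts observed at `input_mean`
# onto the equal-rank quantile at `output_mean`, averaging a normal and a gamma
# approximation of the negative binomial.
counts = np.array([[4.0, 10.0, 7.0]])
input_mean = np.array([[5.0, 8.0, 6.0]])
output_mean = np.array([[6.0, 6.0, 6.0]])
dispersion = np.array([0.1])
print(q2qnbinom(counts, input_mean, output_mean, dispersion))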
def __next__(self):
    if len(self.dummy) < 1:
        raise StopIteration

    probas = norm.logsf(self.a, loc=self.mu, scale=np.sqrt(np.diag(self.Sigma)))
    chosen_index = self.orderer(probas)

    condition_value = self.__class__.truncnorm_mean(
        mean=self.mu[chosen_index],
        var=self.Sigma[chosen_index, chosen_index],
        a=self.a[chosen_index])

    for x in (self.mu, self.a, self.dummy):
        self.__class__.reorder_1d(x, chosen_index)
    self.__class__.reorder_2d(self.Sigma, chosen_index)

    self.mu, self.Sigma = self.__class__.condition_on_first_variable(
        mu=self.mu, Sigma=self.Sigma, val=condition_value)

    if self.debug:
        print(self.mu)
        print(self.Sigma)

    self.a = self.a[1:]
    res = self.dummy[0]
    self.dummy = self.dummy[1:]
    return res
def loglik(mu):
    ll = -sum(norm.logpdf(values[not_censored], loc=mu, scale=std_est))
    if n_left_cens > 0:
        ll -= sum(norm.logcdf(values[left_censored], loc=mu, scale=std_est))
    if n_right_cens > 0:
        ll -= sum(norm.logsf(values[right_censored], loc=mu, scale=std_est))
    return ll
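
# A minimal sketch (every name below is an assumption; in the original these
# live in the enclosing scope): loglik is a negative log-likelihood for left-
# and right-censored normal data, so mu can be fit with a 1-D optimizer.
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.stats import norm

rng = np.random.default_rng(1)
values = rng.normal(loc=3.0, scale=2.0, size=200)
left_censored = values < 0.0            # recorded only as "below 0"
right_censored = values > 6.0           # recorded only as "above 6"
not_censored = ~(left_censored | right_censored)
n_left_cens, n_right_cens = left_censored.sum(), right_censored.sum()
values = np.clip(values, 0.0, 6.0)      # censored entries hold the bound itself
std_est = values[not_censored].std()

mu_hat = minimize_scalar(loglik).x      # maximum-likelihood estimate of mu
print(mu_hat)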
import numpy as np
from scipy.stats import norm


def discretized_normal_log(yvals, mean, sd):
    s = yvals[1] - yvals[0]  # distance between points in the y grid
    # LOG cdf at the midpoint between each grid point and the previous point
    bot_dist = norm.logcdf(x=yvals - s / 2.0, loc=mean, scale=sd)
    # LOG cdf at the midpoint between each grid point and the next point
    top_dist = norm.logcdf(x=yvals + s / 2.0, loc=mean, scale=sd)
    # expm1 difference formula: log(cdf_top - cdf_bot), computed stably in log space
    diff = top_dist + np.log(-np.expm1(bot_dist - top_dist))
    diff[0] = top_dist[0]  # first value should integrate from -inf
    # last value should integrate to +inf
    diff[-1] = norm.logsf(x=yvals[-1] - s / 2.0, loc=mean, scale=sd)
    return diff
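
# Quick sanity check (a sketch; the grid and parameters are arbitrary): the
# discretized log-probabilities should exponentiate and sum to 1, because the
# first and last cells absorb the two infinite tails.
ygrid = np.linspace(-5.0, 5.0, 101)
logp = discretized_normal_log(ygrid, mean=0.3, sd=1.2)
print(np.exp(logp).sum())  # ~1.0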
def ps_not_dark_hot(self, cube_in, write_fits=False):
    name = 'ps_not_dark_hot'
    ps = np.ones_like(cube_in.counts.data, dtype=bool)

    cts, exp = cube_in.cts_exp_shadowgram(*self.erange_dark)
    lp_dd, lp_hd = shadowgram.logprob_not_dark_hot(cts, exp, self.ref_dark)
    cts, exp = cube_in.cts_exp_shadowgram(*self.erange_hot)
    lp_dh, lp_hh = shadowgram.logprob_not_dark_hot(cts, exp, self.ref_hot)

    dark, hot = [
        np.logical_and(lp < norm.logsf(self.sigma_max), np.logical_not(cts.mask))
        for lp in (lp_dd, lp_hh)
    ]
    ps[:, np.logical_or(dark, hot)] = False

    if write_fits:
        pixels = np.zeros(cts.shape, dtype=np.ubyte)
        pixels[cts.mask] += 1
        pixels[dark] += 2
        pixels[hot] += 4
        self.write_outlier_map(name, cube_in.scwid, pixels)

    return ps
def log_one_minus_fdp_gaussian(params, logfpr):
    """
    :param params: 'sigma' is the normalized noise level: std divided by the
        global L2 sensitivity
    :param logfpr: log of the false positive rate, the input to the f-DP function
    :return: log(1 - f(x)).
    """
    sigma = params['sigma']
    assert sigma >= 0
    if sigma == 0:
        return 0
    if np.isneginf(logfpr):
        return -np.inf
    norm_ppf_one_minus_fpr = utils.stable_norm_ppf_one_minus_x(logfpr)
    return norm.logsf(norm_ppf_one_minus_fpr - 1 / sigma)
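
# Context (a sketch, not from the original source): for the Gaussian mechanism,
# the mu-GDP trade-off function is f(alpha) = Phi(Phi^{-1}(1 - alpha) - mu)
# with mu = 1/sigma, so log(1 - f(alpha)) = norm.logsf(Phi^{-1}(1 - alpha) - 1/sigma).
# For moderate alpha the stable helper above can be bypassed with norm.ppf:
import numpy as np
from scipy.stats import norm

sigma, alpha = 1.0, 0.05
print(norm.logsf(norm.ppf(1 - alpha) - 1 / sigma))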
def prob_inside_dg_each(self, vs, g_idx, logscale=False, dg_mean=None,
                        dg_var=None, offset=0.0):
    '''
    Args:
        vs: 2D array (num_vs, ndim)
        g_idx: integer index of an axis
        logscale: return the log-probability if True

    Returns:
        pd: 1D array (num_vs), P('v_g' is inside) for each 'v' in 'vs'
    '''
    vs = np.atleast_2d(vs)
    if dg_mean is None or dg_var is None:
        dg_mean, dg_var = self.posterior_dg(vs, g_idx, full_cov=False)
    # sf is the survival function, 1 - cdf, computed in a numerically stable way
    if logscale:
        return norm.logsf(offset, dg_mean, np.sqrt(dg_var))
    return norm.sf(offset, dg_mean, np.sqrt(dg_var))
def pnorm(q, mean=0, sd=1, lowertail=True, log=False):
    """
    ============================================================================
    pnorm()
    ============================================================================
    The cumulative distribution function for the normal distribution.
    You provide a value along the normal distribution (e.g. q=3), or an array
    of values, and it returns the proportion of the distribution that lies
    below it. Alternatively, if you set lowertail=False, it returns the
    proportion that lies above it.

    USAGE:
    cnorm(mean=0, sd=1, type="equal", conf=0.95)
    dnorm(x, mean=0, sd=1, log=False)
    pnorm(q, mean=0, sd=1, lowertail=True, log=False)
    qnorm(p, mean=0, sd=1, lowertail=True, log=False)
    rnorm(n=1, mean=0, sd=1)

    :param q (float, array of floats): the quantile(s) along the distribution
    :param mean (float): mean of the distribution
    :param sd (float): standard deviation
    :param lowertail (bool): if True, return the proportion below q;
        if False, the proportion above q
    :param log (bool): return the log of the probability?
    :return: an array of probabilities corresponding to the values in q
    ============================================================================
    """
    if lowertail and not log:
        return norm.cdf(x=q, loc=mean, scale=sd)
    elif not lowertail and not log:
        return norm.sf(x=q, loc=mean, scale=sd)
    elif lowertail and log:
        return norm.logcdf(x=q, loc=mean, scale=sd)
    else:
        return norm.logsf(x=q, loc=mean, scale=sd)
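
# A short usage sketch (values chosen arbitrarily), mirroring R's pnorm():
from scipy.stats import norm

print(pnorm(1.96))                              # ~0.975, like R's pnorm(1.96)
print(pnorm(1.96, lowertail=False))             # ~0.025, like pnorm(1.96, lower.tail=FALSE)
print(pnorm(1.96, lowertail=False, log=True))   # log of the upper-tail probability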
import math

import numpy as np
from scipy.stats import norm


def Zscore_to_Pvalue(zscore):
    """Convert a z-score to a -log10 (one-sided) p-value."""
    abs_zscore = np.absolute(zscore)
    pvalue = -1 * (norm.logsf(abs_zscore) / math.log(10))
    return pvalue
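
# Sanity check (a sketch): z = 1.6449 has a one-sided p-value of ~0.05, so the
# function should return about -log10(0.05) ~= 1.301.
print(Zscore_to_Pvalue(1.6449))  # ~1.301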
# SageMath script: erf, sqrt, load, RR and pi are provided by Sage's global namespace.
import itertools

import numpy as np
import tqdm
from scipy.stats import norm

ACCURACY = 2**-300

#######################
#  error computation  #
#######################
delta = lambda n, q, sd: 1 - erf(round(q / 4) / (sqrt(2) * sd**2 * sqrt(2 * n)))
err = lambda n, q, sd, m: (1 - (1 - delta(n, q, sd))**m).n().log(2)
delta2 = lambda n, q, sd, B: norm.logsf(
    float(round(q / 2 / 2**B)),
    scale=float(sd**2 * sqrt(2 * n))) / np.log(2) + 1
err2 = lambda n, q, sd, B, m: delta2(n, q, sd, B) + np.log2(m)

#########################
#  security estimation  #
#########################
try:
    load('https://bitbucket.org/malb/lwe-estimator/raw/HEAD/estimator.py')

    def est(n, q, sd):
        alpha = sqrt(2 * pi) * sd / RR(q)
        m = n
        secret_distribution = "normal"
        success_probability = 0.99
def log_sf(self, s):
    return norm.logsf(s, loc=self.mu, scale=self.std)
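
# A minimal sketch (the class name and attributes are assumptions) showing the
# one-line wrapper in context: logsf returns log P(X > s) directly, which stays
# finite far in the tail where the plain survival function underflows.
from scipy.stats import norm

class NormalScoreModel:
    def __init__(self, mu, std):
        self.mu, self.std = mu, std

    def log_sf(self, s):
        return norm.logsf(s, loc=self.mu, scale=self.std)

m = NormalScoreModel(mu=0.0, std=1.0)
print(m.log_sf(10.0))   # ~-53.2
print(m.log_sf(40.0))   # ~-804.6, while norm.sf(40.0) underflows to 0.0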
from math import sqrt

from scipy.stats import norm

n = 468900
p = 0.00018
x = 244

# Normal approximation to the binomial tail: survival function = 1 - cdf.
the_scale = sqrt(n * p * (1 - p))
the_loc = n * p
output = -norm.logsf(x, loc=the_loc, scale=the_scale)
print(output)
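
# A comparison sketch: scipy can also evaluate the exact binomial tail in log
# space. This far out (x is ~17 sd above n*p) the normal approximation
# understates the tail probability considerably.
from scipy.stats import binom
print(-binom.logsf(x - 1, n, p))  # -log P(X >= x) exactly; noticeably smaller than `output`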
def logG(self, t, x):
    Gamma_e = self.Gamma[t][0:t].reshape((t, 1))
    x = x[:, 0:t]
    truncations = (-x @ Gamma_e + self.a[t]).reshape((len(x),))
    return norm.logsf(truncations)
def test_pairs(self):
    mean_ins = 142
    std_ins = 81.5431220178
    max_ins = 386

    in_bam = tk_bam.create_bam_infile(PAIR_FILE)

    for idx, read in enumerate(in_bam):
        if idx % 2 == 0:
            read1 = read
        else:
            read2 = read
            rp = tk_readpairs.ReadPair(read1, read2)

            if idx == 1:
                # HISEQ-002:203:HBERLADXX:2:1109:13682:22490
                self.assertEqual(tk_readpairs.map_qstart(read1), 0)
                self.assertEqual(tk_readpairs.map_qstart(read2), 0)
                self.assertEqual(rp.sv_type, tk_readpairs.DEL_STR)
                range1, range2 = rp.get_break_ranges(max_ins)
                # Maximum distance between reads is max_ins - 2 * 88
                self.assertEqual(range1, (189704422 + 88, 189704422 + max_ins - 88 + 1))
                self.assertEqual(range2, (189783402 - max_ins + 2 * 88, 189783402 + 1))

            if idx == 13:
                # HISEQ-002:203:HBERLADXX:2:2201:3706:93604
                self.assertEqual(tk_readpairs.map_qstart(read1), 0)
                self.assertEqual(tk_readpairs.map_qstart(read2), 0)
                self.assertEqual(rp.sv_type, tk_readpairs.INV_STR)
                range1, range2 = rp.get_break_ranges(max_ins)
                self.assertEqual(range1, (162503343 - max_ins + 2 * 88, 162503343 + 1))
                self.assertEqual(range2, (162627709 - max_ins + 2 * 88, 162627709 + 1))
                self.assertEqual(tk_readpairs.get_pair_break_dist(read1, 162503243), 188)
                self.assertEqual(tk_readpairs.get_pair_break_dist(read2, 162627609), 154)

            if idx == 19:
                # HISEQ-002:203:HBERLADXX:1:2214:1366:24373
                self.assertEqual(tk_readpairs.map_qstart(read1), 0)
                self.assertEqual(tk_readpairs.map_qstart(read2), 0)
                self.assertEqual(rp.sv_type, tk_readpairs.DEL_STR)
                range1, range2 = rp.get_break_ranges(max_ins)
                # +8 because this read has 8 bp of deletions with respect to the
                # reference, so the last aligned position on the reference is
                # start + 88 + 8.
                self.assertEqual(range1, (78967094 + 88 + 8, 78967094 + max_ins - 88 + 1 + 8))
                self.assertEqual(range2, (79036484 - max_ins + 2 * 88, 79036484 + 1))
                self.assertEqual(tk_readpairs.get_pair_break_dist(read1, 78967294), 200)
                self.assertEqual(tk_readpairs.get_pair_break_dist(read2, 79036284), 288)

                # The ranges are single points. range2[1] is outside the valid range.
                probs = rp.get_break_lr(
                    np.arange(range1[0], range1[1]),
                    np.ones((range1[1] - range1[0],)) * range2[1], 0,
                    lambda x: 0.0)
                assert np.all(probs == 0)

                # The returned probabilities should be decreasing because we
                # "used up" all the insert size on the side of read2.
                probs = rp.get_break_lr(
                    np.arange(range1[0], range1[1]),
                    np.ones((range1[1] - range1[0],)) * range2[1], max_ins,
                    lambda x: norm.logsf(x, mean_ins, std_ins))
                self.assertEqual(list(probs), list(sorted(probs)[::-1]))

                # The returned probabilities should be increasing because the
                # mean insert size is pretty large.
                probs = rp.get_break_lr(
                    np.arange(range1[0], range1[1]),
                    np.ones((range1[1] - range1[0],)) * range2[0], 10000,
                    lambda x: norm.logsf(x, 10000, std_ins))
                self.assertEqual(list(probs), list(sorted(probs)))

    in_bam.close()
def _logG(self, t: int, a_t: float, x: np.ndarray, assert_mode: int = 1):
    assert x.shape[1] == t + assert_mode
    Gamma_e = self.Gamma[t][0:t].reshape((t, 1))
    x = x[:, 0:t]
    truncations = (-x @ Gamma_e + a_t).reshape((len(x),))
    return norm.logsf(truncations)
while True:
    i += 1
    if i % 1000 == 0:
        print(i)
    rmse = K.proc_next_packet()
    if rmse == -1:
        break
    RMSEs.append(rmse)
stop = time.time()
print("Complete. Time elapsed: " + str(stop - start))

# Here we demonstrate how one can fit the RMSE scores to a log-normal
# distribution (useful for finding/setting a cutoff threshold phi).
from scipy.stats import norm
benignSample = np.log(RMSEs[FMgrace + ADgrace + 1:100000])
# Log survival probability of each RMSE under the fitted log-normal,
# i.e. log P(X > x) for a benign score X; very negative values flag anomalies.
logProbs = norm.logsf(np.log(RMSEs), np.mean(benignSample), np.std(benignSample))

# plot the RMSE anomaly scores
print("Plotting results")
from matplotlib import pyplot as plt
from matplotlib import cm
plt.figure(figsize=(10, 5))
fig = plt.scatter(range(FMgrace + ADgrace + 1, len(RMSEs)),
                  RMSEs[FMgrace + ADgrace + 1:],
                  s=0.1,
                  c=logProbs[FMgrace + ADgrace + 1:],
                  cmap='RdYlGn')
plt.yscale("log")  # log-scale the y axis
plt.title("Anomaly Scores from Kitsune's Execution Phase")
plt.ylabel("RMSE (log scaled)")
def sigma_loglikelihood(sigma):
    wp_loglikelihood = norm.logpdf(wp, wp_pred, sigma)
    bp_loglikelihood = norm.logsf(bp, bp_pred, sigma)
    return -(np.sum(wp_loglikelihood) + np.sum(bp_loglikelihood))
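
# A minimal sketch (all names and data below are assumptions): wp holds exactly
# observed values with predictions wp_pred, while bp holds right-censored lower
# bounds with predictions bp_pred, so they contribute through the survival
# function. sigma is then fit by 1-D minimization of the negative log-likelihood.
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.stats import norm

rng = np.random.default_rng(2)
wp_pred = rng.uniform(0.0, 10.0, size=100)
wp = wp_pred + rng.normal(0.0, 1.5, size=100)
bp_pred = rng.uniform(0.0, 10.0, size=40)
bp = bp_pred - np.abs(rng.normal(0.0, 1.5, size=40))  # observed only as "at least bp"

res = minimize_scalar(sigma_loglikelihood, bounds=(1e-6, 10.0), method="bounded")
print(res.x)  # estimated sigma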
print(i + 1)
rm, lt = C.process(X[i, ])
rms.append(rm)
lts.append(lt)
stop = time.time()
print("Complete. Time elapsed: " + str(stop - start))

prms = np.array(rms[200000:])
plts = np.array(lts[200000:])
scores = np.zeros(400000)
scores[:100000] = 2 * np.exp(10 * prms[:100000])
scores[100000:] = np.exp(10 * prms[100000:]) + np.exp(10 * plts[100000:])
index = np.array(range(len(scores)))

benignSample = np.log(scores[:50000])
logProbs = norm.logsf(np.log(scores), np.mean(benignSample), np.std(benignSample))

fig3 = plt.figure(figsize=(12.8, 6.4))
plt.scatter(index, scores, s=4, c=logProbs, cmap='RdYlGn')
plt.ylim([min(scores), max(scores) + 1.5])
plt.annotate('Normal Traffic',
             xy=(index[26000], 3),
             xytext=(index[0], max(scores)),
             arrowprops=dict(facecolor='black', shrink=0.005),
             fontsize='large')
plt.annotate('DDoS Attack Traffic',
             xy=(index[100000], max(scores)),
             xytext=(index[0], max(scores) + 1),
             arrowprops=dict(facecolor='black', shrink=0.005),
             fontsize='large')