# Imports assumed by the snippets below.
import logging
import sys

import numpy as np
from scipy.special import logsumexp


def log_add_by_margs(probability_distr):
    """Sum the log marginal (logsumexp) of each row of a log-space array."""
    tot = 0
    for row in probability_distr:
        tot += logsumexp(row)
    return tot
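# A minimal usage sketch (hypothetical data): log_add_by_margs computes
# log prod_i sum_j exp(row_ij), so rows that are already normalized in
# log space each contribute ~0.
def _example_log_add_by_margs():
    distr = np.log(np.array([[0.25, 0.25, 0.5],
                             [0.2, 0.3, 0.5]], dtype=np.float32))
    return log_add_by_margs(distr)  # ~0.0, since every row sums to 1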
def __init__(self, hist_array, labels, phasing=None, ignore_nan=False):
    conv = 1e-40
    # We need to normalize the array to 1. This is hard.
    # This step does the following:
    #   - convert to log space after adding a small convolution parameter
    #     so we don't get INF and NAN
    #   - for each mutation, normalize the row to 1 in log space
    hist = np.asarray(hist_array, dtype=np.float32) + conv
    if (~(hist > 0)).any():
        logging.error("Negative histogram bin or NAN mutation!")
        if ignore_nan:
            logging.warning(
                "Caught ignore_nan flag, not exiting; setting NAN values to zero")
            hist[np.logical_not(hist > 0)] = conv
        else:
            sys.exit(1)
    hist[:, :, 0] = conv  # set zero bins (for every sample) to the floor value
    self._hist_array = np.apply_over_axes(
        lambda x, y: np.apply_along_axis(lambda z: z - logsumexp(z), y, x),
        np.log(hist), 2)
    self._labels = labels
    self._label_ids = {label: idx for idx, label in enumerate(labels)}
    self._phasing = {} if phasing is None else phasing
    self.n_samples = np.shape(hist_array)[1]
    self.n_bins = np.shape(hist_array)[-1]
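# Sketch of what the apply_over_axes call above computes (hypothetical
# shapes): for a (mutations, samples, bins) array, each (mutation, sample)
# row of bins is shifted so that its logsumexp is 0, i.e. each row becomes
# a normalized distribution in log space.
def _example_normalize_last_axis():
    hist = np.random.rand(4, 2, 8).astype(np.float32) + 1e-40
    log_hist = np.apply_over_axes(
        lambda x, y: np.apply_along_axis(lambda z: z - logsumexp(z), y, x),
        np.log(hist), 2)
    # every row now sums to 1 in linear space
    assert np.allclose(np.exp(log_hist).sum(axis=2), 1.0, atol=1e-5)
    return log_hist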
import sselogsumexp  # compiled extension from github.com/rmcgibbo/logsumexp


def sselogsum(v):
    """
    Fastest log-sum-exp on v array: https://github.com/rmcgibbo/logsumexp

    :param v: must be np.float32
    """
    return sselogsumexp.logsumexp(v)
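# Usage sketch: sselogsum needs the compiled extension above and float32
# input; it should agree with scipy's logsumexp to float32 precision.
def _example_sselogsum():
    v = np.random.randn(1000).astype(np.float32)
    return sselogsum(v)  # ~= logsumexp(v)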
def get_norm_marg_hist(log_sum, log_prior):
    # Normalize in each dimension
    log_f_LL = log_sum + log_prior
    log_f_post = []
    for row in log_f_LL:
        log_f_post.append(row - logsumexp(row))
    return np.array(log_f_post, dtype=np.float32)
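# Usage sketch (hypothetical inputs): combine a log likelihood with a log
# prior and renormalize each row, yielding a log posterior per row.
def _example_get_norm_marg_hist():
    log_sum = np.log(np.random.rand(3, 5).astype(np.float32))
    log_prior = np.log(np.full(5, 0.2, dtype=np.float32))
    log_post = get_norm_marg_hist(log_sum, log_prior)
    assert np.allclose(np.exp(log_post).sum(axis=1), 1.0, atol=1e-5)
    return log_post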
def change_of_variables(x_in, y_in, x_out):
    ##################################################################
    # Function to do a change of variables for prob density function #
    ##################################################################
    # x_in  -- list, original domain
    # y_in  -- list, original pdf
    # x_out -- list, transformed domain (has to be same dim as x_in)

    # bin x_in
    delta_x_in = bin_x_and_calculate_delta(x_in)
    # calculate the integral of the function over all bins and renormalize
    y_in = np.array(y_in, dtype=np.float32)
    # y_in = y_in - logsumexp(y_in + np.log(delta_x_in, dtype=np.float32))
    # for each bin, keep the integral of each bin consistent, but transform
    # widths according to the transformation
    delta_x_out = bin_x_and_calculate_delta(x_out)
    y_out = np.array(y_in + np.log(delta_x_in) - np.log(delta_x_out),
                     dtype=np.float32)
    return y_out - logsumexp(y_out)
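# bin_x_and_calculate_delta is referenced above but not defined in this
# excerpt. A plausible stand-in (an assumption, not the original): return
# the width of the bin around each grid point.
def bin_x_and_calculate_delta(x):
    # central-difference spacing doubles as a per-point bin width
    return np.gradient(np.asarray(x, dtype=np.float32))


# Usage sketch: transform a flat log pdf on x to the squared domain x**2;
# the Jacobian correction is carried by the ratio of bin widths.
def _example_change_of_variables():
    x_in = np.linspace(0.1, 1.0, 50)
    y_in = np.zeros(50, dtype=np.float32)  # flat (unnormalized) log pdf
    return change_of_variables(x_in, y_in, x_in ** 2)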
def offline_changepoint_detection(data, prior_func,
                                  observation_log_likelihood_function,
                                  truncate=-np.inf):
    """Compute the likelihood of changepoints on data.

    Keyword arguments:
    data -- the time series data
    prior_func -- a function giving the likelihood of a changepoint
        given the distance to the last one
    observation_log_likelihood_function -- a function giving the log
        likelihood of a data part
    truncate -- the cutoff probability 10^truncate to stop computation
        for that changepoint log likelihood
    """
    n = len(data)
    Q = np.zeros((n,))
    g = np.zeros((n,))
    G = np.zeros((n,))
    P = np.ones((n, n)) * -np.inf

    # save everything in log representation
    for t in range(n):
        g[t] = np.log(prior_func(t))
        if t == 0:
            G[t] = g[t]
        else:
            G[t] = np.logaddexp(G[t - 1], g[t])

    P[n - 1, n - 1] = observation_log_likelihood_function(data, n - 1, n)
    Q[n - 1] = P[n - 1, n - 1]

    for t in reversed(range(n - 1)):
        P_next_cp = -np.inf  # == log(0)
        for s in range(t, n - 1):
            P[t, s] = observation_log_likelihood_function(data, t, s + 1)

            # compute recursion
            summand = P[t, s] + Q[s + 1] + g[s + 1 - t]
            P_next_cp = np.logaddexp(P_next_cp, summand)

            # truncate sum to become approx. linear in time (see
            # Fearnhead, 2006, eq. (3))
            if summand - P_next_cp < truncate:
                break

        P[t, n - 1] = observation_log_likelihood_function(data, t, n)

        # (1 - G) is numerically stable until G becomes numerically 1
        if G[n - 1 - t] < -1e-15:  # exp(-1e-15) = .99999...
            antiG = np.log(1 - np.exp(G[n - 1 - t]))
        else:
            # (1 - G) is approx. -log(G) for G close to 1
            antiG = np.log(-G[n - 1 - t])

        Q[t] = np.logaddexp(P_next_cp, P[t, n - 1] + antiG)

    Pcp = np.ones((n - 1, n - 1)) * -np.inf
    for t in range(n - 1):
        Pcp[0, t] = P[0, t] + Q[t + 1] + g[t] - Q[0]
        if np.isnan(Pcp[0, t]):
            Pcp[0, t] = -np.inf
    for j in range(1, n - 1):
        for t in range(j, n - 1):
            tmp_cond = (Pcp[j - 1, j - 1:t] + P[j:t + 1, t] + Q[t + 1]
                        + g[0:t - j + 1] - Q[j:t + 1])
            Pcp[j, t] = logsumexp(tmp_cond.astype(np.float32))
            if np.isnan(Pcp[j, t]):
                Pcp[j, t] = -np.inf

    return Q, P, Pcp
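# Usage sketch for the function above, with a uniform prior and a naive
# Gaussian segment likelihood; both helpers are illustrative stand-ins,
# not part of any library API. The default argument binds the variant
# defined directly above, since another definition with the same name
# appears later in this file.
def _example_offline_detection(detect=offline_changepoint_detection):
    data = np.concatenate([np.random.randn(50), np.random.randn(50) + 5])

    def const_prior(t):
        return 1.0 / (len(data) + 1)

    def gaussian_segment_loglik(x, t, s):
        seg = x[t:s]
        mu, sd = seg.mean(), seg.std() + 1e-6
        return np.sum(-0.5 * np.log(2 * np.pi * sd ** 2)
                      - 0.5 * ((seg - mu) / sd) ** 2)

    Q, P, Pcp = detect(data, const_prior, gaussian_segment_loglik,
                       truncate=-40)
    # probability of a changepoint at each time step, summed over the
    # changepoint index
    return np.exp(Pcp).sum(0)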
def normalize_loghist_with_prior(self, loghist):
    # Normalize in each dimension
    return np.apply_along_axis(lambda x: x - logsumexp(x), 1,
                               loghist + self.logprior)
def logsum_of_marginals_per_sample(loghist):
    return np.apply_along_axis(logsumexp, 1,
                               np.array(loghist, dtype=np.float32))
def logsum_of_marginals(loghist):
    return np.sum(np.apply_along_axis(logsumexp, 1, loghist))
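# Sanity-check sketch for the two helpers above (hypothetical data): after
# normalizing each row in log space, every per-row logsumexp is ~0, so the
# summed marginal is ~0 as well.
def _example_logsum_of_marginals():
    loghist = np.log(np.random.rand(3, 6).astype(np.float32))
    norm = np.apply_along_axis(lambda x: x - logsumexp(x), 1, loghist)
    per_row = logsum_of_marginals_per_sample(norm)  # ~np.zeros(3)
    total = logsum_of_marginals(norm)               # ~0.0
    return per_row, total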
def _normalize_loghist_with_prior(self, loghist):
    """Normalize in each dimension in log space."""
    loghist = np.asarray(loghist, dtype=np.float32)
    return np.apply_along_axis(lambda x: x - logsumexp(x), 1,
                               loghist + self._logprior)
def offline_changepoint_detection(data, prior_function, log_likelihood_class,
                                  truncate: int = -40):
    """
    Compute the likelihood of changepoints on data.

    Parameters:
    data -- the time series data
    prior_function -- a function returning the log prior of a changepoint
        given the distance to the last one
    log_likelihood_class -- an object whose pdf(data, t, s) method returns
        the log likelihood of the data segment [t, s)
    truncate -- the cutoff probability 10^truncate to stop computation for
        that changepoint log likelihood

    Outputs:
    P -- the log-likelihood of a data sequence [t, s], given there is no
        changepoint between t and s
    Q -- the log-likelihood of data
    Pcp -- the log-likelihood that the i-th changepoint is at time step t.
        To actually get the probability of a changepoint at time step t,
        sum the probabilities.
    """
    # Set up the placeholders for each parameter
    n = len(data)
    Q = np.zeros((n,))
    g = np.zeros((n,))
    G = np.zeros((n,))
    P = np.ones((n, n)) * -np.inf

    # save everything in log representation
    for t in range(n):
        g[t] = prior_function(t)
        if t == 0:
            G[t] = g[t]
        else:
            G[t] = np.logaddexp(G[t - 1], g[t])

    P[n - 1, n - 1] = log_likelihood_class.pdf(data, t=n - 1, s=n)
    Q[n - 1] = P[n - 1, n - 1]

    for t in reversed(range(n - 1)):
        P_next_cp = -np.inf  # == log(0)
        for s in range(t, n - 1):
            P[t, s] = log_likelihood_class.pdf(data, t=t, s=s + 1)

            # compute recursion
            summand = P[t, s] + Q[s + 1] + g[s + 1 - t]
            P_next_cp = np.logaddexp(P_next_cp, summand)

            # truncate sum to become approx. linear in time (see
            # Fearnhead, 2006, eq. (3))
            if summand - P_next_cp < truncate:
                break

        P[t, n - 1] = log_likelihood_class.pdf(data, t=t, s=n)

        # (1 - G) is numerically stable until G becomes numerically 1
        if G[n - 1 - t] < -1e-15:  # exp(-1e-15) = .99999...
            antiG = np.log(1 - np.exp(G[n - 1 - t]))
        else:
            # (1 - G) is approx. -log(G) for G close to 1
            antiG = np.log(-G[n - 1 - t])

        Q[t] = np.logaddexp(P_next_cp, P[t, n - 1] + antiG)

    Pcp = np.ones((n - 1, n - 1)) * -np.inf
    for t in range(n - 1):
        Pcp[0, t] = P[0, t] + Q[t + 1] + g[t] - Q[0]
        if np.isnan(Pcp[0, t]):
            Pcp[0, t] = -np.inf
    for j in range(1, n - 1):
        for t in range(j, n - 1):
            tmp_cond = (Pcp[j - 1, j - 1:t] + P[j:t + 1, t] + Q[t + 1]
                        + g[0:t - j + 1] - Q[j:t + 1])
            Pcp[j, t] = logsumexp(tmp_cond.astype(np.float32))
            if np.isnan(Pcp[j, t]):
                Pcp[j, t] = -np.inf

    return Q, P, Pcp
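# Usage sketch for the class-based variant above: prior_function must
# already return a log prior here (it is not wrapped in np.log), and the
# likelihood object needs a pdf(data, t, s) method returning a log
# likelihood. Both stand-ins below are illustrative assumptions.
def _example_class_based_detection():
    data = np.concatenate([np.random.randn(40), np.random.randn(40) + 3])

    def log_const_prior(t):
        return -np.log(len(data) + 1)

    class NaiveGaussianLikelihood:
        def pdf(self, data, t, s):
            seg = data[t:s]
            mu, sd = seg.mean(), seg.std() + 1e-6
            return np.sum(-0.5 * np.log(2 * np.pi * sd ** 2)
                          - 0.5 * ((seg - mu) / sd) ** 2)

    Q, P, Pcp = offline_changepoint_detection(
        data, log_const_prior, NaiveGaussianLikelihood(), truncate=-40)
    return np.exp(Pcp).sum(0)  # changepoint probability per time step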