def _approximate_dataprob_correction(self, sample_size):
    '''
    Ad hoc approximation,
    see `python derivations/clustering.py dataprob`
    see `python derivations/clustering.py approximations`
    '''
    n = log(sample_size)
    N = log(self.dataset_size)
    return 0.061 * n * (n - N) * (n + N) ** 0.75
def score_data(self, shared):
    """Computes the joint log density log p(q, Y)."""
    prior = sp.stats.beta.logpdf(self.p, shared.alpha, shared.beta)
    if 0. < self.p < 1.:
        likelihood = self.heads * log(self.p) + self.tails * log(1. - self.p)
    else:
        # avoid log(0); a degenerate p contradicted by any observation
        # has zero probability
        likelihood = -np.inf
    return prior + likelihood
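# A numeric sketch (hypothetical values): the likelihood term above is the
# Bernoulli log likelihood of `heads` successes and `tails` failures, i.e.
# the binomial log pmf without its combinatorial coefficient.

import numpy as np
import scipy.stats
from scipy.special import gammaln

p, heads, tails = 0.3, 4, 6
n = heads + tails
log_coeff = gammaln(n + 1) - gammaln(heads + 1) - gammaln(tails + 1)
likelihood = heads * np.log(p) + tails * np.log(1. - p)
assert np.isclose(likelihood, scipy.stats.binom.logpmf(heads, n, p) - log_coeff)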
def score_group(self, group):
    """
    \cite{murphy2007conjugate}, Eq. 171
    """
    post = self.plus_group(group)
    return gammaln(post.nu / 2.) - gammaln(self.nu / 2.) \
        + 0.5 * log(self.kappa / post.kappa) \
        + (0.5 * self.nu) * log(self.nu * self.sigmasq) \
        - (0.5 * post.nu) * log(post.nu * post.sigmasq) \
        - group.count / 2. * 1.1447298858493991  # log(pi)
def score_data(self, shared):
    """
    \cite{murphy2007conjugate}, Eq. 171
    """
    post = shared.plus_group(self)
    return gammaln(post.nu / 2.) - gammaln(shared.nu / 2.) \
        + 0.5 * log(shared.kappa / post.kappa) \
        + (0.5 * shared.nu) * log(shared.nu * shared.sigmasq) \
        - (0.5 * post.nu) * log(post.nu * post.sigmasq) \
        - self.count / 2. * 1.1447298858493991  # log(pi)
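# Sanity check (a sketch): the magic constant in the two scores above is
# log(pi), which enters Eq. 171 through its pi**(-count/2) factor.

from math import log, pi

assert abs(1.1447298858493991 - log(pi)) < 1e-12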
def score_student_t(x, nu, mu, sigmasq):
    """
    \cite{murphy2007conjugate}, Eq. 304
    """
    score = gammaln(.5 * (nu + 1.)) - gammaln(.5 * nu)
    score -= .5 * log(nu * pi * sigmasq)
    xt = x - mu
    s = xt * xt / sigmasq
    score += -(.5 * (nu + 1.)) * log(1. + s / nu)
    return score
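# A minimal sanity check (assumes scipy is available): Murphy's
# (mu, sigmasq, nu) parameterization is the location-scale Student t with
# scale sqrt(sigmasq), so the score should match scipy's log density.

import numpy as np
import scipy.stats

x, nu, mu, sigmasq = 1.3, 4.0, 0.5, 2.0
expected = scipy.stats.t.logpdf(x, df=nu, loc=mu, scale=np.sqrt(sigmasq))
assert abs(score_student_t(x, nu, mu, sigmasq) - expected) < 1e-12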
def score_value(self, shared, value):
    """
    \cite{wallach2009rethinking}, Eq. 4
    McCallum, et al., 'Rethinking LDA: Why Priors Matter'
    """
    numer = self.counts[value] + shared.alphas[value]
    denom = self.counts.sum() + shared.alphas.sum()
    return log(numer / denom)
def score_value(self, shared, value):
    """
    \cite{wallach2009rethinking}, Eq. 4
    McCallum, et al., 'Rethinking LDA: Why Priors Matter'
    """
    heads = shared.alpha + self.heads
    tails = shared.beta + self.tails
    numer = heads if value else tails
    denom = heads + tails
    return log(numer / denom)
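# A numeric sketch (hypothetical counts): both predictives above are
# posterior means, e.g. in the beta-Bernoulli case
# P[heads | data] = (alpha + heads) / (alpha + beta + heads + tails).

from math import isclose, log

alpha, beta, heads, tails = 0.5, 0.5, 7, 3
score = log((alpha + heads) / (alpha + beta + heads + tails))
assert isclose(score, log(7.5 / 11.0))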
def score_value(self, group, value):
    """
    Adapted from dd.py, which was adapted from:
    McCallum, et al., 'Rethinking LDA: Why Priors Matter', Eq. 4
    """
    denom = self.alpha + group.total
    if value == OTHER:
        numer = self.beta0 * self.alpha
    else:
        numer = self.betas[value] * self.alpha + group.counts.get(value, 0)
    return log(numer / denom)
def score_add_value(
        self,
        group_size,
        nonempty_group_count,
        sample_size,
        empty_group_count=1):
    '''
    Return the log posterior predictive probability given sufficient
    statistics of a partial assignment vector [X_0, ..., X_{n-1}]

        log P[ X_n = k | X_0=x_0, ..., X_{n-1}=x_{n-1} ]

    where

        group_size = #{i | x_i = k, i in {0, ..., n-1}}
        nonempty_group_count = #{x_i | i in {0, ..., n-1}}
        sample_size = n

    and empty_group_count is the number of empty groups that are uniformly
    competing for the assignment. Typically empty_group_count = 1, but
    multiple empty "ephemeral" groups are used in e.g. Radford Neal's
    Algorithm 8 \cite{neal2000markov}.
    '''
    assert sample_size < self.dataset_size
    assert 0 < empty_group_count
    if group_size == 0:
        score = -log(empty_group_count)
        if sample_size + 1 < self.dataset_size:
            score += self._approximate_postpred_correction(sample_size + 1)
        return score
    # see `python derivations/clustering.py fastlog`
    very_large = 10000
    bigger = 1.0 + group_size
    if group_size > very_large:
        # for large counts, group_size * log(bigger / group_size) -> 1
        return 1.0 + log(bigger)
    else:
        return log(bigger / group_size) * group_size + log(bigger)
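# A usage sketch (hypothetical `model` with dataset_size > 8 already set): a
# single Gibbs assignment step normalizes these scores across the existing
# groups plus one empty group.

import numpy as np

group_sizes = [3, 5, 0]  # the trailing 0 is the empty group
scores = np.array([model.score_add_value(size, 2, 8) for size in group_sizes])
probs = np.exp(scores - np.logaddexp.reduce(scores))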
def _approximate_postpred_correction(self, sample_size):
    '''
    Ad hoc approximation,
    see `python derivations/clustering.py postpred`
    see `python derivations/clustering.py approximations`
    '''
    assert 0 < sample_size
    assert sample_size < self.dataset_size
    exponent = 0.45 - 0.1 / sample_size - 0.1 / self.dataset_size
    scale = self.dataset_size / sample_size
    return log(scale) * exponent
def score_value(self, shared, value):
    """
    Adapted from dd.py, which was adapted from:
    McCallum, et al., 'Rethinking LDA: Why Priors Matter', Eq. 4
    """
    denom = shared.alpha + self.total
    if value == OTHER:
        numer = shared.beta0 * shared.alpha
    else:
        count = self.counts.get(value, 0)
        assert count >= 0, "cannot score while in debt"
        numer = shared.betas[value] * shared.alpha + count
    return log(numer / denom)
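# A sketch with hypothetical numbers: the shared betas put mass on seen
# values and the remainder beta0 is reserved for OTHER, so the numerators
# across all outcomes sum exactly to the denominator alpha + total.

from math import log

alpha = 1.0
betas = {'a': 0.5, 'b': 0.25}      # masses of values seen so far
beta0 = 1.0 - sum(betas.values())  # mass reserved for unseen values
counts, total = {'a': 3, 'b': 1}, 4
score_a = log((betas['a'] * alpha + counts['a']) / (alpha + total))
score_other = log((beta0 * alpha) / (alpha + total))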
def log_partition_function(self, sample_size):
    '''
    Computes

        log_sum_exp(
            sum(n * log(n) for n in partition)
            for partition in partitions(sample_size))

    exactly for small n, and approximately for large n.
    '''
    # TODO incorporate dataset_size for higher accuracy
    n = sample_size
    if n < 48:
        return LowEntropy.log_partition_function_table[n]
    else:
        coeff = 0.28269584
        log_z_max = n * log(n)
        return log_z_max * (1.0 + coeff * n ** -0.75)
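# A sketch of the large-n branch: n * log(n) is the log of the largest term
# in the sum (all items in one group), and the coefficient adds a small
# relative correction that decays like n**-0.75.

from math import log

n = 1000  # hypothetical sample size past the table cutoff
log_z = n * log(n) * (1.0 + 0.28269584 * n ** -0.75)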
def score_counts(self, counts):
    '''
    Return the log probability of data, given sufficient statistics of a
    partial assignment vector [X_0, ..., X_{n-1}]

        log P[ X_0=x_0, ..., X_{n-1}=x_{n-1} ]
    '''
    score = 0.0
    sample_size = 0
    for count in counts:
        sample_size += count
        if count > 1:
            score += count * log(count)
    assert sample_size <= self.dataset_size
    if sample_size != self.dataset_size:
        log_factor = self._approximate_postpred_correction(sample_size)
        score += log_factor * (len(counts) - 1)
        score += self._approximate_dataprob_correction(sample_size)
    score -= self.log_partition_function(sample_size)
    return score
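# A usage sketch (hypothetical `model` with dataset_size = 100): score a
# partial assignment whose nonzero group sizes are the entries of `counts`;
# here sample_size = 6 < dataset_size, so both ad hoc corrections apply.

log_prob = model.score_counts([3, 2, 1])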
def score_value(self, shared, value):
    """Scores a value using the explicit p."""
    return log(self.p) if value else log(1. - self.p)
def remove_value(self, shared, value):
    self.count -= 1
    self.sum -= int(value)
    self.log_prod -= log(factorial(value))

def add_repeated_value(self, shared, value, count):
    self.count += count
    self.sum += int(count * value)
    self.log_prod += count * log(factorial(value))

def add_value(self, shared, value):
    self.count += 1
    self.sum += int(value)
    self.log_prod += log(factorial(value))
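# A consistency sketch (hypothetical freshly initialized groups g1, g2 and a
# matching `shared`): adding a value `count` times one at a time must agree
# with a single add_repeated_value call, since all three sufficient
# statistics update linearly in count.

from math import isclose

for _ in range(3):
    g1.add_value(shared, 5)
g2.add_repeated_value(shared, 5, 3)
assert (g1.count, g1.sum) == (g2.count, g2.sum)
assert isclose(g1.log_prod, g2.log_prod)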
def score_data(self, shared):
    post = shared.plus_group(self)
    return gammaln(post.alpha) - gammaln(shared.alpha) \
        - post.alpha * log(post.inv_beta) \
        + shared.alpha * log(shared.inv_beta) \
        - self.log_prod
def score_value(self, shared, value):
    post = shared.plus_group(self)
    return gammaln(post.alpha + value) - gammaln(post.alpha) \
        + post.alpha * log(post.inv_beta) \
        - (post.alpha + value) * log(1. + post.inv_beta) \
        - log(factorial(value))
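# A numeric sketch (assumes inv_beta is the gamma rate, as the name
# suggests): the gamma-Poisson posterior predictive is negative binomial
# with n = alpha and p = inv_beta / (1 + inv_beta), so for hypothetical
# posterior parameters the score formula should match scipy.

from math import factorial, log

import scipy.stats
from scipy.special import gammaln

alpha, inv_beta, value = 2.5, 1.5, 4
score = (gammaln(alpha + value) - gammaln(alpha)
         + alpha * log(inv_beta)
         - (alpha + value) * log(1. + inv_beta)
         - log(factorial(value)))
expected = scipy.stats.nbinom.logpmf(value, alpha, inv_beta / (1. + inv_beta))
assert abs(score - expected) < 1e-10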