def _split(self, data, constrained_clustering=None):
    # Seed the particle system by placing the two anchor points in separate blocks.
    particle = self.kernel.create_particle(0, data[0], None, log_q={0: 0})
    particle = self.kernel.create_particle(1, data[1], particle, log_q={1: 0})
    if constrained_clustering is None:
        # Unconstrained split: allocate the remaining points sequentially from the kernel proposal.
        for data_point in data[2:]:
            particle = self.kernel.propose(data_point, particle)
    else:
        # Constrained split: replay the existing allocation so the reverse move can be scored.
        constrained_clustering = relabel_clustering(constrained_clustering)
        for block_idx, data_point in zip(constrained_clustering[2:], data[2:]):
            particle = self.kernel.create_particle(block_idx, data_point, particle)
    clustering = get_cluster_labels(particle)
    init_params = [
        self.dist.create_params_from_data(data[0]),
        self.dist.create_params_from_data(data[1]),
    ]
    log_mh_factor = get_log_normalisation(particle) + self.kernel.log_target_density(init_params)
    return clustering, log_mh_factor
def _get_updated_clustering(self, clustering, particle, sigma):
    restricted_clustering = get_cluster_labels(particle)
    max_idx = clustering.max()
    clustering[sigma] = restricted_clustering + max_idx + 1
    return relabel_clustering(clustering)
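# A minimal, self-contained sketch of the relabelling convention assumed by the code above:
# cluster labels are mapped to consecutive integers starting at 0, in order of first appearance.
# This helper is illustrative only; relabel_clustering from the library is the function actually
# used throughout this module.
import numpy as np


def _relabel_clustering_sketch(clustering):
    clustering = np.asarray(clustering)
    relabelled = np.zeros_like(clustering)
    label_map = {}
    for i, c in enumerate(clustering):
        if c not in label_map:
            label_map[c] = len(label_map)
        relabelled[i] = label_map[c]
    return relabelled


# Example: _relabel_clustering_sketch(np.array([3, 3, 1, 7, 1])) -> array([0, 0, 1, 2, 1])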
def update(self, clustering):
    clustering = relabel_clustering(clustering)
    clusters = np.unique(clustering)
    num_clusters = len(clusters)
    self.cluster_probs = np.zeros((num_clusters, num_clusters))
    self.clusters_to_data = {}
    self.data_to_clusters = {}
    margs = {}
    # Cache the log marginal likelihood of each cluster and the cluster/data index maps.
    for c in clusters:
        cluster_data = self.data[clustering == c]
        cluster_params = self.dist.create_params_from_data(cluster_data)
        margs[c] = self.dist.log_marginal_likelihood(cluster_params)
        if self.use_prior_weight:
            margs[c] += self.partition_prior.log_tau_2(cluster_params.N)
        self.clusters_to_data[c] = np.where(clustering == c)[0].flatten()
        for i in self.clusters_to_data[c]:
            self.data_to_clusters[i] = c
    # Score every candidate merge partner for each cluster by the ratio of the merged marginal
    # likelihood to the product of the separate marginal likelihoods, then normalise each row.
    for c_i in clusters:
        log_p = np.ones(num_clusters) * float('-inf')
        for c_j in clusters:
            if c_i == c_j:
                continue
            merged_data = self.data[(clustering == c_i) | (clustering == c_j)]
            merged_params = self.dist.create_params_from_data(merged_data)
            merge_marg = self.dist.log_marginal_likelihood(merged_params)
            if self.use_prior_weight:
                merge_marg += self.partition_prior.log_tau_2(merged_params.N)
            log_p[c_j] = merge_marg - (margs[c_i] + margs[c_j])
        if num_clusters == 1:
            log_p[c_i] = 0
        else:
            log_p[c_i] = -np.log(num_clusters - 1) + log_sum_exp(log_p)
        self.cluster_probs[c_i], _ = exp_normalize(log_p)
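# A minimal sketch of how the cluster_probs matrix built above could be consumed, assuming each
# row is treated as a normalised categorical distribution over the cluster from which the second
# anchor is drawn (off-diagonal entries favour merges of similar clusters). The helper name and
# calling convention are illustrative assumptions, not part of the library's API; numpy is assumed
# imported as np, as elsewhere in this module.
def _sample_partner_cluster_sketch(cluster_probs, anchor_cluster):
    # Draw a partner cluster index for the given anchor cluster.
    num_clusters = cluster_probs.shape[0]
    return np.random.choice(num_clusters, p=cluster_probs[anchor_cluster])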
def _sample(self, clustering, data):
    anchors, sigma = self.split_merge_setup_kernel.setup_split_merge(clustering, 2)
    self.kernel.setup(anchors, clustering, data, sigma, set_constrained_path=False)
    clustering_sigma = clustering[sigma]
    data_sigma = data[sigma]
    # If the two anchors lie in different clusters we propose merging them, otherwise a split.
    propose_merge = (clustering_sigma[0] != clustering_sigma[1])
    if propose_merge:
        merge_clustering, merge_mh_factor = self._merge(data_sigma)
        split_clustering, split_mh_factor = self._split(data_sigma, constrained_clustering=clustering_sigma)
        forward_factor = merge_mh_factor
        reverse_factor = split_mh_factor
        restricted_clustering = merge_clustering
    else:
        merge_clustering, merge_mh_factor = self._merge(data_sigma)
        split_clustering, split_mh_factor = self._split(data_sigma)
        forward_factor = split_mh_factor
        reverse_factor = merge_mh_factor
        restricted_clustering = split_clustering
    # Metropolis-Hastings accept/reject for the proposed restricted clustering.
    log_ratio = forward_factor - reverse_factor
    u = np.random.random()
    if log_ratio >= np.log(u):
        max_idx = clustering.max()
        clustering[sigma] = restricted_clustering + max_idx + 1
        clustering = relabel_clustering(clustering)
    return clustering
def get_constrained_path(clustering, data, kernel):
    # Build the sequence of particles obtained by allocating each data point to its existing
    # block, so the SMC sampler can be conditioned on the current clustering.
    constrained_path = []
    clustering = relabel_clustering(clustering)
    particle = None
    for c, x in zip(clustering, data):
        particle = kernel.create_particle(c, x, particle)
        constrained_path.append(particle)
    return constrained_path
def sample(self, clustering, data, num_iters=1):
    for _ in range(num_iters):
        anchors, sigma = self._setup_split_merge(clustering)
        self.smc_kernel.setup(anchors, clustering, data, sigma)
        particles_weights = self.smc_sampler.sample(data[sigma], self.smc_kernel)
        sampled_particle = self._sample_particle(particles_weights)
        # _get_updated_clustering already relabels, so use its return value directly.
        clustering = self._get_updated_clustering(clustering, sampled_particle, sigma)
    return clustering
def get_exact_posterior(data, dist, partition_prior):
    '''
    Compute the exact posterior of the clustering model by enumerating all partitions.

    Returns a dictionary mapping clusterings (as tuples of relabelled block indices) to
    posterior probabilities.
    '''
    log_p = []
    clusterings = []
    for c in get_all_clusterings(data.shape[0]):
        clusterings.append(tuple(relabel_clustering(c).astype(int)))
        log_p.append(log_joint_probability(c, data, dist, partition_prior))
    p, _ = exp_normalize(np.array(log_p))
    return dict(zip(clusterings, p))
def update(self, clustering):
    self.cluster_params = {}
    self.clusters_to_data = {}
    self.data_to_clusters = {}
    self.clustering = relabel_clustering(clustering)
    for c in np.unique(clustering):
        # Rebuild the sufficient statistics of each cluster by adding its data points one at a time.
        cluster_data = self.data[clustering == c]
        self.cluster_params[c] = self.dist.create_params()
        for data_point in cluster_data:
            self.cluster_params[c].increment(data_point)
        self.clusters_to_data[c] = np.where(clustering == c)[0].flatten()
def _run_sampler_posterior(self, data, sampler, burnin=int(1e2), num_iters=int(1e4)):
    # Run the sampler from the all-in-one-cluster state and tabulate the post-burnin
    # empirical distribution over (relabelled) clusterings.
    clustering = np.zeros(data.shape[0], dtype=int)
    test_counts = Counter()
    for i in range(num_iters):
        clustering = sampler.sample(clustering, data)
        if i >= burnin:
            test_counts[tuple(relabel_clustering(clustering))] += 1
    posterior_probs = defaultdict(float)
    norm_const = sum(test_counts.values())
    for key in test_counts:
        posterior_probs[key] = test_counts[key] / norm_const
    return posterior_probs
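# A minimal, self-contained sketch of how the empirical posterior returned above could be checked
# against the enumeration from get_exact_posterior, e.g. via total variation distance. The function
# name is illustrative and not part of the library; both arguments are dicts mapping clustering
# tuples to probabilities.
def _total_variation_distance_sketch(exact_posterior, sampled_posterior):
    keys = set(exact_posterior) | set(sampled_posterior)
    return 0.5 * sum(abs(exact_posterior.get(k, 0.0) - sampled_posterior.get(k, 0.0)) for k in keys)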