def sample(self, data, partition, alpha):
    '''
    Resample the cell assignment of every item in `data`, updating `partition` in place.

    For each item in turn: take it out of its current cell, discard any cell left empty, then draw a new
    assignment. An existing cell is weighted by log(cell.size) plus the `cluster_density` log-density of
    the data point at the cell's value; opening a new cell is weighted by log(alpha) plus the
    `posterior_density` log-density of the data point at `base_measure.params`.

    Returns nothing.
    '''
    for item, data_point in enumerate(data):
        partition.remove_item(item, partition.labels[item])

        # Empty cells have to go before weighting: log(cell.size) has no finite value for size 0.
        partition.remove_empty_cells()

        weights = [
            log(cell.size) + self.cluster_density.log_p(data_point, cell.value)
            for cell in partition.cells
        ]

        # The last entry stands for a cell that does not exist yet.
        weights.append(log(alpha) + self.posterior_density.log_p(data_point, self.base_measure.params))

        probabilities = [exp(w) for w in log_space_normalise(weights)]
        chosen = discrete_rvs(probabilities)

        # An index one past the existing cells means the new-cell entry was drawn (assumes
        # partition.number_of_cells == len(partition.cells) -- TODO confirm).
        if chosen == partition.number_of_cells:
            partition.add_cell(self.base_measure.random())

        partition.add_item(item, chosen)
def _split(self, i, j, old_cell, data, partition): old_cell.remove_item(i) old_cell.remove_item(j) param_i = old_cell.value param_j = self.proposal_func.random(param_i) forward_log_q = self.proposal_func.log_p(param_i, param_i) + self.proposal_func.log_p(param_j, param_i) reverse_log_q = self.proposal_func.log_p(param_i, param_i) new_cell_i = partition.add_cell(param_i) new_cell_j = partition.add_cell(param_j) new_cell_i.add_item(i) new_cell_j.add_item(j) s = old_cell.items shuffle(s) for k in s: old_cell.remove_item(k) n_i = new_cell_i.size n_j = new_cell_j.size log_p = [ log(n_i) + self.cluster_density.log_p(data[k], param_i), log(n_j) + self.cluster_density.log_p(data[k], param_j) ] log_p = log_space_normalise(log_p) p = [exp(x) for x in log_p] c_k = discrete_rvs(p) if c_k == 0: new_cell_i.add_item(k) else: new_cell_j.add_item(k) forward_log_q += log_p[c_k] partition.remove_empty_cells() return new_cell_i, new_cell_j, forward_log_q, reverse_log_q
def sample(self, data, partition, alpha, m=2):
    '''
    Sample a new partition according to algorithm 8 of Neal "Sampling Methods For Dirichlet Process Mixture
    Models"

    Items are visited in random order and `partition` is updated in place. For each item, empty candidate
    cells with values drawn from `self.base_measure` are added so that `m` empty cells are on offer. The
    item is then assigned to a cell with weight log(size) + log-density, where an empty cell uses
    `alpha / m` in place of its size. Unused empty cells are removed before the next item.

    `m` is the number of empty candidate cells per item. Returns nothing.
    '''
    # shuffle() needs a mutable sequence; on Python 3 a bare range() is immutable and raises TypeError.
    items = list(range(len(data)))
    shuffle(items)

    # Stand-in "size" for an empty cell. float() forces true division even for integer alpha and m.
    empty_cell_weight = alpha / float(m)

    for item in items:
        data_point = data[item]

        old_cell_index = partition.get_cell_index(item)
        partition.remove_item(item, old_cell_index)

        # If the item was alone in its cell, that cell is now empty and is kept as one of the m
        # candidates, so one fewer fresh cell is needed.
        if partition.counts[old_cell_index] == 0:
            num_new_tables = m - 1
        else:
            num_new_tables = m

        for _ in range(num_new_tables):
            partition.add_cell(self.base_measure.random())

        log_p = []
        for cell in partition.cells:
            cluster_log_p = self.cluster_density.log_p(data_point, cell.value)
            counts = cell.size
            if counts == 0:
                counts = empty_cell_weight
            log_p.append(log(counts) + cluster_log_p)

        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]

        new_cell_index = discrete_rvs(p)
        partition.add_item(item, new_cell_index)

        # Drop the candidates that were not chosen so they do not pile up across items.
        partition.remove_empty_cells()
def sample(self, data, partition, alpha, m=2):
    '''
    Sample a new partition according to algorithm 8 of Neal "Sampling Methods For Dirichlet Process Mixture
    Models"

    Items are visited in random order and `partition` is updated in place. For each item, empty candidate
    cells with values drawn from `self.base_measure` are added so that `m` empty cells are on offer. The
    item is then assigned to a cell with weight log(size) + log-density, where an empty cell uses
    `alpha / m` in place of its size. Unused empty cells are removed before the next item.

    `m` is the number of empty candidate cells per item. Returns nothing.
    '''
    # shuffle() needs a mutable sequence; on Python 3 a bare range() is immutable and raises TypeError.
    items = list(range(len(data)))
    shuffle(items)

    # Stand-in "size" for an empty cell. float() forces true division even for integer alpha and m.
    empty_cell_weight = alpha / float(m)

    for item in items:
        data_point = data[item]

        old_cell_index = partition.labels[item]
        partition.remove_item(item, old_cell_index)

        # If the item was alone in its cell, that cell is now empty and is kept as one of the m
        # candidates, so one fewer fresh cell is needed.
        if partition.counts[old_cell_index] == 0:
            num_new_tables = m - 1
        else:
            num_new_tables = m

        for _ in range(num_new_tables):
            partition.add_cell(self.base_measure.random())

        log_p = []
        for cell in partition.cells:
            cluster_log_p = self.cluster_density.log_p(data_point, cell.value)
            counts = cell.size
            if counts == 0:
                counts = empty_cell_weight
            log_p.append(log(counts) + cluster_log_p)

        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]

        new_cell_index = discrete_rvs(p)
        partition.add_item(item, new_cell_index)

        # Drop the candidates that were not chosen so they do not pile up across items.
        partition.remove_empty_cells()
def sample_from_crp(alpha, size, base_measure):
    '''
    Build a random `Partition` of `size` items by seating customers at tables one at a time.

    The first customer opens table 0. Each later customer draws a table index from the probabilities
    returned by `_get_table_probabilities(tables, alpha)`; an index equal to the current number of
    tables opens a new table. Every table gets a value drawn from `base_measure.random()` when it is
    opened.

    Returns a `Partition` with one cell per table and one item per customer (items are numbered
    0 .. size - 1). For `size` < 1 an empty `Partition` is returned.
    '''
    if size < 1:
        # Nobody to seat: do not create the first table (and do not consume a base-measure draw).
        return Partition()

    # Seat the first customer
    tables = [[0]]
    labels = [0]
    values = [base_measure.random()]

    for customer in range(1, size):
        # Presumably one probability per existing table plus a final one for a new table -- the
        # comparison with len(tables) below relies on that; confirm against the helper.
        p = _get_table_probabilities(tables, alpha)
        table_id = discrete_rvs(p)

        if table_id == len(tables):
            tables.append([customer])
            values.append(base_measure.random())
        else:
            tables[table_id].append(customer)

        labels.append(table_id)

    partition = Partition()

    # Cells are added in table order so that a label is also the index of its cell.
    for v in values:
        partition.add_cell(v)

    for item, cell_index in enumerate(labels):
        partition.add_item(item, cell_index)

    return partition
def sample(self, old_value, num_clusters, num_data_points):
    '''
    Draw a new value from the previous value, the number of clusters and the number of data points.

    An auxiliary variable is drawn with `beta_rvs(old_value + 1, num_data_points)`. The new value then
    comes from one of two `gamma_rvs` draws that share the second argument `self.b - log(eta)` and differ
    only in the first argument (`self.a + num_clusters` or one less). Which draw is used is decided by
    `discrete_rvs`.

    NOTE(review): this has the shape of the Escobar & West (1995) update for a concentration parameter
    with a Gamma(a, b) prior -- confirm. In that paper the quantity `b - log(eta)` is a rate; check
    that the parameterisation of `gamma_rvs` agrees with the local name `scale`.
    '''
    a, b = self.a, self.b

    eta = beta_rvs(old_value + 1, num_data_points)
    scale = b - log(eta)

    # Odds of the first mixture component; converted to a probability on the next line.
    odds = (a + num_clusters - 1) / (num_data_points * scale)
    pi = odds / (1 + odds)

    use_first = discrete_rvs([pi, 1 - pi]) == 0
    shape = a + num_clusters if use_first else a + num_clusters - 1

    return gamma_rvs(shape, scale)
def sample(self, data, partition, alpha):
    '''
    Update `partition` in place with a Metropolis pass followed by a restricted Gibbs pass over `data`.

    First pass, per item:
      * if the item was alone in its cell, propose moving it to an existing cell chosen with probability
        proportional to the cell counts, and accept with log-ratio
        log(n - 1) - log(alpha) + new log-density - old log-density;
      * otherwise propose a new cell with a value from `self.base_measure.random()` and accept with
        log-ratio log(alpha) - log(n - 1) + new log-density - old log-density.
      A rejected proposal puts the item back in its old cell.

    Second pass, per item: items alone in their cell are skipped; every other item is reassigned among
    the existing cells with weight log(size) + log-density. No new cell is opened in this pass.

    NOTE(review): the structure matches algorithm 7 of Neal, "Sampling Methods For Dirichlet Process
    Mixture Models" -- confirm.

    Returns nothing. With fewer than two items in the partition no item is moved.
    '''
    n = partition.number_of_items

    if n < 2:
        # Every proposal below divides by, or takes the log of, n - 1. With a single item there is
        # also no other cell to move it to, so only tidy up and stop.
        partition.remove_empty_cells()
        return

    for item, data_point in enumerate(data):
        old_cluster_label = partition.labels[item]
        old_value = partition.item_values[item]

        partition.remove_item(item, old_cluster_label)

        if partition.counts[old_cluster_label] == 0:
            # The item was a singleton. float() keeps this a true division whatever the
            # interpreter's integer-division rule; the emptied cell has count 0 and is never drawn.
            p = [float(x) / (n - 1) for x in partition.counts]
            new_cluster_label = discrete_rvs(p)
            new_value = partition.cell_values[new_cluster_label]

            old_ll = self.cluster_density.log_p(data_point, old_value)
            new_ll = self.cluster_density.log_p(data_point, new_value)
            log_ratio = log(n - 1) - log(alpha) + new_ll - old_ll

            u = uniform_rvs(0, 1)
            if log_ratio >= log(u):
                partition.add_item(item, new_cluster_label)
            else:
                partition.add_item(item, old_cluster_label)
        else:
            new_value = self.base_measure.random()

            old_ll = self.cluster_density.log_p(data_point, old_value)
            new_ll = self.cluster_density.log_p(data_point, new_value)
            log_ratio = log(alpha) - log(n - 1) + new_ll - old_ll

            u = uniform_rvs(0, 1)
            if log_ratio >= log(u):
                partition.add_cell(new_value)
                cell = partition.get_cell_by_value(new_value)
                cell.add_item(item)
            else:
                partition.add_item(item, old_cluster_label)

        # An accepted singleton move leaves its old cell empty; remove it before the next item.
        partition.remove_empty_cells()

    for item, data_point in enumerate(data):
        old_cluster_label = partition.labels[item]

        # Singletons are left alone here: removing one would empty its cell.
        if partition.cells[old_cluster_label].size == 1:
            continue

        partition.remove_item(item, old_cluster_label)

        log_p = []
        for cell in partition.cells:
            cluster_log_p = self.cluster_density.log_p(data_point, cell.value)
            counts = cell.size
            log_p.append(log(counts) + cluster_log_p)

        log_p = log_space_normalise(log_p)
        p = [exp(x) for x in log_p]

        new_cluster_label = discrete_rvs(p)
        partition.add_item(item, new_cluster_label)

    partition.remove_empty_cells()