Example #1
0
    def sample(self, data, partition, alpha):
        '''
        Resample the cell assignment of every item in `data`, one item at a time.

        Each item is taken out of its current cell, cells left empty are dropped, and the item is
        then placed either in one of the remaining cells or in a freshly created one. `partition`
        is modified in place and nothing is returned.

        Args:
            data: Sequence of data points; the position in the sequence is used as the item id.
            partition: Partition object holding the current assignment of items to cells.
            alpha: Weight given to opening a new cell (its logarithm is taken).
        '''
        for item, data_point in enumerate(data):
            # Detach the item and discard its cell if that left the cell empty.
            partition.remove_item(item, partition.labels[item])
            partition.remove_empty_cells()

            # One unnormalised log weight per surviving cell: density of the point under the
            # cell's value, plus the log of the cell's size.
            log_weights = [
                self.cluster_density.log_p(data_point, cell.value) + log(cell.size)
                for cell in partition.cells
            ]

            # Final entry stands for "open a new cell", scored with the posterior density
            # evaluated at the base measure's parameters.
            new_cell_log_p = self.posterior_density.log_p(data_point, self.base_measure.params)
            log_weights.append(log(alpha) + new_cell_log_p)

            probabilities = [exp(w) for w in log_space_normalise(log_weights)]

            new_cell_index = discrete_rvs(probabilities)

            # An index one past the existing cells means the new-cell option was drawn.
            if new_cell_index == partition.number_of_cells:
                partition.add_cell(self.base_measure.random())

            partition.add_item(item, new_cell_index)
Example #2
0
    def sample(self, data, partition, alpha):
        '''
        Sweep once over `data`, re-drawing the cell that each item belongs to.

        For every item the current assignment is removed (dropping any cell this empties), a
        categorical distribution over the existing cells plus one extra "new cell" option is
        built, and the item is re-inserted according to a draw from it. The partition is updated
        in place; the method returns None.

        Args:
            data: Sequence of data points, indexed by item id.
            partition: Partition object to update.
            alpha: Weight of the new-cell option (its logarithm is taken).
        '''
        for item, data_point in enumerate(data):
            current_index = partition.labels[item]
            partition.remove_item(item, current_index)
            partition.remove_empty_cells()

            # Existing cells are weighted by size times the density of the point under the cell.
            scores = []
            for cell in partition.cells:
                point_log_p = self.cluster_density.log_p(data_point, cell.value)
                scores.append(log(cell.size) + point_log_p)

            # The trailing score corresponds to creating a brand-new cell.
            base_params = self.base_measure.params
            scores.append(log(alpha) + self.posterior_density.log_p(data_point, base_params))

            normalised = log_space_normalise(scores)
            new_cell_index = discrete_rvs([exp(s) for s in normalised])

            # Drawing the trailing option requires the new cell to exist before insertion.
            if new_cell_index == partition.number_of_cells:
                partition.add_cell(self.base_measure.random())

            partition.add_item(item, new_cell_index)
Example #3
0
    def _split(self, i, j, old_cell, data, partition):
        '''
        Propose splitting `old_cell` into two new cells anchored on items `i` and `j`.

        Item `i` seeds a new cell that reuses the old cell's value; item `j` seeds a new cell
        whose value is drawn from the proposal function. The remaining items of `old_cell` are
        visited in random order and each is assigned to one of the two new cells with probability
        proportional to (cell size) x (density of the item under the cell's value).

        Args:
            i: Item id that anchors the first new cell.
            j: Item id that anchors the second new cell.
            old_cell: Cell currently containing both `i` and `j`; it is emptied by this call.
            data: Indexable collection of data points, keyed by item id.
            partition: Partition that owns `old_cell`; the new cells are added to it and empty
                cells are removed before returning.

        Returns:
            Tuple `(new_cell_i, new_cell_j, forward_log_q, reverse_log_q)` where the last two
            entries are the accumulated log proposal terms computed below.
        '''
        old_cell.remove_item(i)
        old_cell.remove_item(j)

        param_i = old_cell.value
        param_j = self.proposal_func.random(param_i)

        forward_log_q = self.proposal_func.log_p(param_i, param_i) + self.proposal_func.log_p(param_j, param_i)
        reverse_log_q = self.proposal_func.log_p(param_i, param_i)

        new_cell_i = partition.add_cell(param_i)
        new_cell_j = partition.add_cell(param_j)

        new_cell_i.add_item(i)
        new_cell_j.add_item(j)

        # Work on a copy: the loop below removes items from old_cell, and iterating over the
        # cell's own container while it shrinks could skip items.
        remaining = list(old_cell.items)
        shuffle(remaining)

        for k in remaining:
            old_cell.remove_item(k)

            n_i = new_cell_i.size
            n_j = new_cell_j.size

            log_p = [
                log(n_i) + self.cluster_density.log_p(data[k], param_i),
                log(n_j) + self.cluster_density.log_p(data[k], param_j)
            ]

            log_p = log_space_normalise(log_p)

            p = [exp(x) for x in log_p]

            c_k = discrete_rvs(p)

            if c_k == 0:
                new_cell_i.add_item(k)
            else:
                new_cell_j.add_item(k)

            # Each allocation contributes its normalised log probability to the forward proposal.
            forward_log_q += log_p[c_k]

        partition.remove_empty_cells()

        return new_cell_i, new_cell_j, forward_log_q, reverse_log_q
Example #4
0
    def _split(self, i, j, old_cell, data, partition):
        '''
        Propose splitting `old_cell` into two new cells anchored on items `i` and `j`.

        Item `i` seeds a new cell that reuses the old cell's value; item `j` seeds a new cell
        whose value is drawn from the proposal function. The remaining items of `old_cell` are
        visited in random order and each is assigned to one of the two new cells with probability
        proportional to (cell size) x (density of the item under the cell's value).

        Args:
            i: Item id that anchors the first new cell.
            j: Item id that anchors the second new cell.
            old_cell: Cell currently containing both `i` and `j`; it is emptied by this call.
            data: Indexable collection of data points, keyed by item id.
            partition: Partition that owns `old_cell`; the new cells are added to it and empty
                cells are removed before returning.

        Returns:
            Tuple `(new_cell_i, new_cell_j, forward_log_q, reverse_log_q)` where the last two
            entries are the accumulated log proposal terms computed below.
        '''
        old_cell.remove_item(i)
        old_cell.remove_item(j)

        param_i = old_cell.value
        param_j = self.proposal_func.random(param_i)

        forward_log_q = self.proposal_func.log_p(param_i, param_i) + self.proposal_func.log_p(param_j, param_i)
        reverse_log_q = self.proposal_func.log_p(param_i, param_i)

        new_cell_i = partition.add_cell(param_i)
        new_cell_j = partition.add_cell(param_j)

        new_cell_i.add_item(i)
        new_cell_j.add_item(j)

        # Work on a copy: the loop below removes items from old_cell, and iterating over the
        # cell's own container while it shrinks could skip items.
        remaining = list(old_cell.items)
        shuffle(remaining)

        for k in remaining:
            old_cell.remove_item(k)

            n_i = new_cell_i.size
            n_j = new_cell_j.size

            log_p = [
                log(n_i) + self.cluster_density.log_p(data[k], param_i),
                log(n_j) + self.cluster_density.log_p(data[k], param_j)
            ]

            log_p = log_space_normalise(log_p)

            p = [exp(x) for x in log_p]

            c_k = discrete_rvs(p)

            if c_k == 0:
                new_cell_i.add_item(k)
            else:
                new_cell_j.add_item(k)

            # Each allocation contributes its normalised log probability to the forward proposal.
            forward_log_q += log_p[c_k]

        partition.remove_empty_cells()

        return new_cell_i, new_cell_j, forward_log_q, reverse_log_q
Example #5
0
    def sample(self, data, partition, alpha, m=2):
        '''
        Sample a new partition according to algorithm 8 of Neal "Sampling Methods For Dirichlet Process Mixture Models"

        Items are visited in random order. Each item is removed from its cell, empty auxiliary
        cells with values drawn from the base measure are added, and the item is re-assigned to
        a cell drawn with probability proportional to (weight) x (density of the point under the
        cell's value), where the weight is the cell size for occupied cells and alpha / m for
        empty ones. `partition` is modified in place; nothing is returned.

        Args:
            data: Indexable collection of data points, keyed by item id.
            partition: Partition object to update.
            alpha: Total weight shared between the empty (auxiliary) cells.
            m: Number of empty cells made available to each item.
        '''
        # shuffle() needs a mutable sequence; a bare range object is immutable on Python 3.
        items = list(range(len(data)))

        shuffle(items)

        # Float division so integer alpha and m cannot truncate the weight to zero.
        empty_cell_weight = float(alpha) / m

        for item in items:
            data_point = data[item]

            old_cell_index = partition.get_cell_index(item)

            partition.remove_item(item, old_cell_index)

            # If removing the item emptied its cell, that cell already counts as one of the
            # m empty cells, so one fewer new cell is needed.
            if partition.counts[old_cell_index] == 0:
                num_new_tables = m - 1
            else:
                num_new_tables = m

            for _ in range(num_new_tables):
                partition.add_cell(self.base_measure.random())

            log_p = []

            for cell in partition.cells:
                cluster_log_p = self.cluster_density.log_p(data_point, cell.value)

                counts = cell.size

                if counts == 0:
                    counts = empty_cell_weight

                log_p.append(log(counts) + cluster_log_p)

            log_p = log_space_normalise(log_p)

            p = [exp(x) for x in log_p]

            new_cell_index = discrete_rvs(p)

            partition.add_item(item, new_cell_index)

            # Drop the auxiliary cells that were not chosen.
            partition.remove_empty_cells()
Example #6
0
 def sample(self, data, partition, alpha, m=2):
     '''
     Sample a new partition according to algorithm 8 of Neal "Sampling Methods For Dirichlet Process Mixture Models"

     Items are visited in random order. Each item is removed from its cell, empty auxiliary
     cells with values drawn from the base measure are added, and the item is re-assigned to
     a cell drawn with probability proportional to (weight) x (density of the point under the
     cell's value), where the weight is the cell size for occupied cells and alpha / m for
     empty ones. `partition` is modified in place; nothing is returned.

     Args:
         data: Indexable collection of data points, keyed by item id.
         partition: Partition object to update.
         alpha: Total weight shared between the empty (auxiliary) cells.
         m: Number of empty cells made available to each item.
     '''
     # shuffle() needs a mutable sequence; a bare range object is immutable on Python 3.
     items = list(range(len(data)))

     shuffle(items)

     # Float division so integer alpha and m cannot truncate the weight to zero.
     empty_cell_weight = float(alpha) / m

     for item in items:
         data_point = data[item]

         old_cell_index = partition.labels[item]

         partition.remove_item(item, old_cell_index)

         # If removing the item emptied its cell, that cell already counts as one of the
         # m empty cells, so one fewer new cell is needed.
         if partition.counts[old_cell_index] == 0:
             num_new_tables = m - 1
         else:
             num_new_tables = m

         for _ in range(num_new_tables):
             partition.add_cell(self.base_measure.random())

         log_p = []

         for cell in partition.cells:
             cluster_log_p = self.cluster_density.log_p(data_point, cell.value)

             counts = cell.size

             if counts == 0:
                 counts = empty_cell_weight

             log_p.append(log(counts) + cluster_log_p)

         log_p = log_space_normalise(log_p)

         p = [exp(x) for x in log_p]

         new_cell_index = discrete_rvs(p)

         partition.add_item(item, new_cell_index)

         # Drop the auxiliary cells that were not chosen.
         partition.remove_empty_cells()
Example #7
0
def sample_from_crp(alpha, size, base_measure):
    '''
    Build a random partition of `size` items by seating customers one at a time.

    The first customer opens table 0. Every later customer picks a table from the probabilities
    returned by `_get_table_probabilities(tables, alpha)`; an index equal to the current number
    of tables opens a new table. Each table receives a value drawn from `base_measure`.

    Args:
        alpha: Parameter forwarded to `_get_table_probabilities`.
        size: Number of items (customers) to seat. Values <= 0 yield an empty partition.
        base_measure: Object whose `random()` method supplies the value of each new table.

    Returns:
        A `Partition` with one cell per table and every item assigned to its table's cell.
    '''
    # Nothing to seat: without this guard the first customer would be seated unconditionally
    # and the result would contain one item even though none were requested.
    if size <= 0:
        return Partition()

    # Seat the first customer
    tables = [[0]]
    labels = [0]
    values = [base_measure.random()]

    for customer in range(1, size):
        p = _get_table_probabilities(tables, alpha)

        table_id = discrete_rvs(p)

        if table_id == len(tables):
            # One past the last existing table means "open a new table".
            tables.append([customer])

            values.append(base_measure.random())
        else:
            tables[table_id].append(customer)

        labels.append(table_id)

    partition = Partition()

    # Cells must exist before items can be assigned to them by index.
    for v in values:
        partition.add_cell(v)

    for item, cell_index in enumerate(labels):
        partition.add_item(item, cell_index)

    return partition
Example #8
0
    def sample(self, old_value, num_clusters, num_data_points):
        '''
        Draw a new value given the previous value, the cluster count and the data count.

        An auxiliary variable is drawn with `beta_rvs(old_value + 1, num_data_points)`; the new
        value is then drawn with `gamma_rvs` using one of two first arguments, chosen at random
        with weights derived from `self.a`, `self.b` and the auxiliary variable.

        NOTE(review): this looks like the auxiliary-variable update of a Dirichlet process
        concentration parameter under a Gamma(a, b) prior - confirm against the class docs.
        NOTE(review): the second argument to `gamma_rvs` is `b - log(eta)`; whether `gamma_rvs`
        treats it as a scale or a rate is not visible here - verify.

        Args:
            old_value: Current value of the parameter being resampled.
            num_clusters: Number of clusters currently in use.
            num_data_points: Number of data points.

        Returns:
            The newly drawn value.
        '''
        prior_a = self.a
        prior_b = self.b

        eta = beta_rvs(old_value + 1, num_data_points)

        # Shared by the mixing odds and by both gamma draws.
        second_param = prior_b - log(eta)

        odds = (prior_a + num_clusters - 1) / (num_data_points * second_param)
        pi = odds / (1 + odds)

        component = discrete_rvs([pi, 1 - pi])

        # Component 0 uses first argument a + k, the other component a + k - 1.
        if component == 0:
            first_param = prior_a + num_clusters
        else:
            first_param = prior_a + num_clusters - 1

        return gamma_rvs(first_param, second_param)
Example #9
0
    def sample(self, data, partition, alpha):
        '''
        Update `partition` in place with a Metropolis-Hastings pass followed by a Gibbs pass.

        First pass, per item: if the item is alone in its cell, propose moving it to an existing
        cell drawn in proportion to the cell sizes; otherwise propose moving it to a new cell
        whose value is drawn from the base measure. Each proposal is accepted or rejected by
        comparing a log ratio with the log of a uniform draw.

        Second pass, per item that is not alone in its cell: reassign the item to an existing
        cell drawn in proportion to (cell size) x (density of the point under the cell's value).

        NOTE(review): this resembles algorithm 7 of Neal, "Markov Chain Sampling Methods for
        Dirichlet Process Mixture Models" - confirm.

        Args:
            data: Sequence of data points; the position in the sequence is used as the item id.
            partition: Partition object to update.
            alpha: Weight of the new-cell move (its logarithm is taken).
        '''
        n = partition.number_of_items

        # Both acceptance ratios divide by (n - 1) or take its logarithm, so with fewer than two
        # items they are undefined. A lone item also has no other cell to move to.
        if n < 2:
            partition.remove_empty_cells()

            return

        # Float denominator so the proposal probabilities are not truncated by integer division.
        num_others = float(n - 1)
        log_num_others = log(num_others)

        for item, data_point in enumerate(data):
            old_cluster_label = partition.labels[item]
            old_value = partition.item_values[item]

            partition.remove_item(item, old_cluster_label)

            if partition.counts[old_cluster_label] == 0:
                # The item was alone: propose joining an existing cell. The emptied cell has
                # count 0 and therefore zero proposal probability.
                p = [x / num_others for x in partition.counts]

                new_cluster_label = discrete_rvs(p)

                new_value = partition.cell_values[new_cluster_label]

                old_ll = self.cluster_density.log_p(data_point, old_value)
                new_ll = self.cluster_density.log_p(data_point, new_value)

                log_ratio = log_num_others - log(alpha) + new_ll - old_ll

                u = uniform_rvs(0, 1)

                if log_ratio >= log(u):
                    partition.add_item(item, new_cluster_label)
                else:
                    partition.add_item(item, old_cluster_label)

            else:
                # The item shared its cell: propose opening a new cell for it.
                new_value = self.base_measure.random()

                old_ll = self.cluster_density.log_p(data_point, old_value)
                new_ll = self.cluster_density.log_p(data_point, new_value)

                log_ratio = log(alpha) - log_num_others + new_ll - old_ll

                u = uniform_rvs(0, 1)

                if log_ratio >= log(u):
                    partition.add_cell(new_value)

                    cell = partition.get_cell_by_value(new_value)

                    cell.add_item(item)
                else:
                    partition.add_item(item, old_cluster_label)

        # Cells vacated by accepted moves are only cleaned up once the first pass is over.
        partition.remove_empty_cells()

        for item, data_point in enumerate(data):
            old_cluster_label = partition.labels[item]

            # Items alone in their cell are left untouched by the second pass.
            if partition.cells[old_cluster_label].size == 1:
                continue

            partition.remove_item(item, old_cluster_label)

            log_p = []

            for cell in partition.cells:
                cluster_log_p = self.cluster_density.log_p(data_point, cell.value)

                counts = cell.size

                log_p.append(log(counts) + cluster_log_p)

            log_p = log_space_normalise(log_p)

            p = [exp(x) for x in log_p]

            new_cluster_label = discrete_rvs(p)

            partition.add_item(item, new_cluster_label)

        partition.remove_empty_cells()
Example #10
0
 def sample(self, data, partition, alpha):
     '''
     Update `partition` in place with a Metropolis-Hastings pass followed by a Gibbs pass.

     First pass, per item: if the item is alone in its cell, propose moving it to an existing
     cell drawn in proportion to the cell sizes; otherwise propose moving it to a new cell
     whose value is drawn from the base measure. Each proposal is accepted or rejected by
     comparing a log ratio with the log of a uniform draw.

     Second pass, per item that is not alone in its cell: reassign the item to an existing
     cell drawn in proportion to (cell size) x (density of the point under the cell's value).

     NOTE(review): this resembles algorithm 7 of Neal, "Markov Chain Sampling Methods for
     Dirichlet Process Mixture Models" - confirm.

     Args:
         data: Sequence of data points; the position in the sequence is used as the item id.
         partition: Partition object to update.
         alpha: Weight of the new-cell move (its logarithm is taken).
     '''
     n = partition.number_of_items

     # Both acceptance ratios divide by (n - 1) or take its logarithm, so with fewer than two
     # items they are undefined. A lone item also has no other cell to move to.
     if n < 2:
         partition.remove_empty_cells()

         return

     # Float denominator so the proposal probabilities are not truncated by integer division.
     num_others = float(n - 1)
     log_num_others = log(num_others)

     for item, data_point in enumerate(data):
         old_cluster_label = partition.labels[item]
         old_value = partition.item_values[item]

         partition.remove_item(item, old_cluster_label)

         if partition.counts[old_cluster_label] == 0:
             # The item was alone: propose joining an existing cell. The emptied cell has
             # count 0 and therefore zero proposal probability.
             p = [x / num_others for x in partition.counts]

             new_cluster_label = discrete_rvs(p)

             new_value = partition.cell_values[new_cluster_label]

             old_ll = self.cluster_density.log_p(data_point, old_value)
             new_ll = self.cluster_density.log_p(data_point, new_value)

             log_ratio = log_num_others - log(alpha) + new_ll - old_ll

             u = uniform_rvs(0, 1)

             if log_ratio >= log(u):
                 partition.add_item(item, new_cluster_label)
             else:
                 partition.add_item(item, old_cluster_label)

         else:
             # The item shared its cell: propose opening a new cell for it.
             new_value = self.base_measure.random()

             old_ll = self.cluster_density.log_p(data_point, old_value)
             new_ll = self.cluster_density.log_p(data_point, new_value)

             log_ratio = log(alpha) - log_num_others + new_ll - old_ll

             u = uniform_rvs(0, 1)

             if log_ratio >= log(u):
                 partition.add_cell(new_value)

                 cell = partition.get_cell_by_value(new_value)

                 cell.add_item(item)
             else:
                 partition.add_item(item, old_cluster_label)

     # Cells vacated by accepted moves are only cleaned up once the first pass is over.
     partition.remove_empty_cells()

     for item, data_point in enumerate(data):
         old_cluster_label = partition.labels[item]

         # Items alone in their cell are left untouched by the second pass.
         if partition.cells[old_cluster_label].size == 1:
             continue

         partition.remove_item(item, old_cluster_label)

         log_p = []

         for cell in partition.cells:
             cluster_log_p = self.cluster_density.log_p(data_point, cell.value)

             counts = cell.size

             log_p.append(log(counts) + cluster_log_p)

         log_p = log_space_normalise(log_p)

         p = [exp(x) for x in log_p]

         new_cluster_label = discrete_rvs(p)

         partition.add_item(item, new_cluster_label)

     partition.remove_empty_cells()