def compute_log_inv_acc_p(self, node_id, param, len_both_children_terminal, loglik,
                          grow_nodes, cache, settings, data):
    """Return the log of the inverse acceptance ratio for ``node_id``.

    The result is the sum of a likelihood term, ``loglik - self.loglik[node_id]``,
    and a prior term built from ``compute_psplit`` / ``compute_pnosplit`` at
    ``node_id``, the two list-size corrections (``len_both_children_terminal``
    and ``len(grow_nodes)``) and the no-split log-priors of the two children.

    Args:
        node_id: id of the node whose children are returned by ``get_children_id``.
        param: forwarded to ``compute_psplit`` / ``compute_pnosplit``.
        len_both_children_terminal: count entering the prior term as ``-log(.)``.
        loglik: log-likelihood compared against ``self.loglik[node_id]``.
        grow_nodes: sized collection; ``log(len(grow_nodes))`` enters the prior term.
        cache, settings, data: forwarded to ``no_valid_split_exists``;
            ``settings.verbose`` controls the diagnostic output.

    Returns:
        ``log_inv_acc_loglik + log_inv_acc_prior``.

    Raises:
        AssertionError: if the recomputed child log-priors disagree with the
            values stored in ``self.logprior`` (as judged by ``check_if_zero``),
            or if the result is not greater than ``-inf``.
    """
    # 1/acc for PRUNE is acc for GROW except for corrections to both_children_terminal
    # and grow_nodes list
    left, right = get_children_id(node_id)
    logprior_children = 0.0
    for child in (left, right):
        # A child only contributes a no-split prior when a valid split exists for it.
        if not no_valid_split_exists(data, cache, self.train_ids[child], settings):
            logprior_children += np.log(self.compute_pnosplit(child, param))
    try:
        check_if_zero(logprior_children - self.logprior[left] - self.logprior[right])
    except AssertionError:
        print('oh oh ... looks like a bug in compute_log_inv_acc_p')
        print('term 1 = %s' % logprior_children)
        print('term 2 = %s, 2a = %s, 2b = %s' % (
            self.logprior[left] + self.logprior[right],
            self.logprior[left], self.logprior[right]))
        print('node_id = %s, left = %s, right = %s, logprior = %s'
              % (node_id, left, right, self.logprior))
        # Re-raise the original assertion so its traceback is not lost.
        raise
    log_inv_acc_prior = (np.log(self.compute_psplit(node_id, param))
                         - np.log(self.compute_pnosplit(node_id, param))
                         - np.log(len_both_children_terminal)
                         + np.log(len(grow_nodes))
                         + logprior_children)
    log_inv_acc_loglik = loglik - self.loglik[node_id]
    log_inv_acc = log_inv_acc_loglik + log_inv_acc_prior
    if settings.verbose >= 2:
        # Signs are flipped so the printed values are the (non-inverse) acceptance terms.
        print('compute_log_inv_acc_p: log_acc_loglik = %s, log_acc_prior = %s'
              % (-log_inv_acc_loglik, -log_inv_acc_prior))
    assert log_inv_acc > -np.inf
    return log_inv_acc
def check_if_same(self, log_acc, loglik_diff, logprior_diff):
    """Cross-check ``log_acc`` against differences recomputed over the whole tree.

    The log-likelihood difference is recomputed over ``self.leaf_nodes`` and
    the log-prior difference over every key of ``self.logprior_new`` /
    ``self.logprior``; their sum must match ``log_acc`` according to
    ``check_if_zero``.

    Args:
        log_acc: value to be verified.
        loglik_diff: log-likelihood difference, used only in the diagnostics.
        logprior_diff: log-prior difference, used only in the diagnostics.

    Raises:
        AssertionError: if the two values disagree, unless both are ``-inf``.
    """
    # change/swap operations should depend only on what happens in current subtree
    loglik_diff_2 = (sum(self.loglik_new[node] for node in self.leaf_nodes)
                     - sum(self.loglik[node] for node in self.leaf_nodes))
    logprior_diff_2 = (sum(self.logprior_new[node] for node in self.logprior_new)
                       - sum(self.logprior[node] for node in self.logprior))
    log_acc_2 = loglik_diff_2 + logprior_diff_2
    try:
        check_if_zero(log_acc - log_acc_2)
    except AssertionError:
        # Two -inf values cannot be compared by subtraction; treat them as equal.
        if log_acc == -np.inf and log_acc_2 == -np.inf:
            return
        print('check if terms match:')
        print('loglik_diff = %s, loglik_diff_2 = %s' % (loglik_diff, loglik_diff_2))
        print('logprior_diff = %s, logprior_diff_2 = %s' % (logprior_diff, logprior_diff_2))
        # Re-raise the original assertion so its traceback is not lost.
        raise
def evaluate_new_subtree(self, data, node_id_start, param, nodes_subtree, cache, settings):
    """Refresh the ``*_new`` statistics of the subtree rooted at ``node_id_start``.

    Pass 1 sends every training index in ``self.train_ids[node_id_start]`` down
    the tree using the splits stored in ``self.node_info_new`` and accumulates
    ``sum_y_new``, ``sum_y2_new``, ``n_points_new`` and ``train_ids_new`` at
    each node it visits. The accumulators are updated in place, so they are
    expected to already hold an entry for every visited node.

    Pass 2 visits each node in ``nodes_subtree`` and recomputes ``loglik_new``
    (and ``param_n_new`` when the node holds data), then ``logprior_new`` for
    leaves or the split probability for internal nodes.

    When ``settings.debug == 1`` the log-likelihood at ``node_id_start`` is
    checked to be unchanged; a mismatch raises ``AssertionError``.
    """
    # Pass 1: route each training point from the subtree root down to a leaf.
    for idx in self.train_ids[node_id_start]:
        features, target = data['x_train'][idx, :], data['y_train'][idx]
        current = copy(node_id_start)
        at_leaf = False
        while not at_leaf:
            self.sum_y_new[current] += target
            self.sum_y2_new[current] += target ** 2
            self.n_points_new[current] += 1
            self.train_ids_new[current] = np.append(self.train_ids_new[current], idx)
            at_leaf = current in self.leaf_nodes
            if not at_leaf:
                child_left, child_right = get_children_id(current)
                # splitting on new criteria
                feat_id, split, _ = self.node_info_new[current]
                current = child_left if features[feat_id] <= split else child_right

    # Pass 2: recompute likelihood and prior terms for every node in the subtree.
    for node in nodes_subtree:
        # An empty node keeps -inf, which rejects any move producing it.
        self.loglik_new[node] = -np.inf
        if self.n_points_new[node] > 0:
            stats = compute_normal_normalizer(
                self.sum_y_new[node], self.sum_y2_new[node],
                self.n_points_new[node], param, cache, settings)
            self.loglik_new[node], self.param_n_new[node] = stats
        if node not in self.leaf_nodes:
            # split probability might have changed if train_ids have changed
            self.recompute_prob_split(data, param, settings, cache, node)
            continue
        if stop_split(self.train_ids_new[node], settings, data, cache):
            # if leaf is empty, logprior_new[node] = 0.0 is incorrect; however
            # loglik_new[node] = -np.inf will reject move to a tree with empty leaves
            self.logprior_new[node] = 0.0
        else:
            # node with just 1 data point earlier could have more data points now
            self.logprior_new[node] = np.log(self.compute_pnosplit(node, param))

    if settings.debug != 1:
        return
    try:
        check_if_zero(self.loglik[node_id_start] - self.loglik_new[node_id_start])
    except AssertionError:
        print('train_ids[node_id_start] = %s, train_ids_new[node_id_start] = %s'
              % (self.train_ids[node_id_start], self.train_ids_new[node_id_start]))
        raise AssertionError
def update_p(self, particles, log_weights, log_pd, settings):
    """Sample one particle and install it as the current tree ``self.p``.

    An index ``k`` is drawn with ``sample_multinomial(softmax(log_weights))``.
    ``particles[k]`` becomes ``self.p`` and ``log_pd`` is stored as
    ``self.log_pd``.

    Args:
        particles: indexable collection of particles exposing ``node_info``,
            ``log_sis_ratio_d``, ``ancestry``, ``print_tree()`` and
            ``check_depth()``.
        log_weights: per-particle log weights, indexable by particle id.
        log_pd: value stored on ``self.log_pd``.
        settings: object whose ``verbose`` level controls diagnostics.

    Returns:
        True when the selected particle's ``node_info`` differs from that of
        the previous ``self.p``. The previous ``node_info`` is taken to be
        ``{}`` when ``self.p`` is not available yet.

    Raises:
        AssertionError: after the first call, if ``k == 0`` yields a different
            tree, or if an identical tree is sampled with ``k != 0`` but
            ``log_weights[k]`` and ``log_weights[0]`` differ (as judged by
            ``check_if_zero``).
    """
    node_info_old = {}
    p_old = None
    first_iter = False
    if settings.verbose >= 2:
        print('log_weights = %s' % log_weights)
    k = sample_multinomial(softmax(log_weights))
    try:
        # Keep a handle on the previous particle: it is needed for the
        # diagnostics below after self.p has been overwritten.
        p_old = self.p
        node_info_old = p_old.node_info
    except AttributeError:
        # first iteration probably: self.p would not be present
        first_iter = True
    same_tree = node_info_old == particles[k].node_info
    self.p = particles[k]
    self.log_pd = log_pd
    if settings.verbose >= 2:
        print('pid_sampled = %s' % k)
        print('new tree:')
        self.p.print_tree()
    if not same_tree and settings.verbose >= 1:
        print('non-identical trees')
    if k == 0 and not first_iter:
        assert same_tree
    elif same_tree and not first_iter:
        # particles from pmcmc during init might be different
        if settings.verbose >= 1:
            print('identical tree without k == 0')
        try:
            check_if_zero(log_weights[k] - log_weights[0])
        except AssertionError:
            print('node_info_old = %s' % node_info_old)
            print('same_tree = %s' % same_tree)
            print('k = %s, particles[k].node_info = %s' % (k, particles[k].node_info))
            print('log_weights[0] = %s, log_weights[k] = %s'
                  % (log_weights[0], log_weights[k]))
            # This branch is only reached when first_iter is False, so p_old
            # is the previous particle here.
            print(p_old.log_sis_ratio_d)
            print(particles[0].log_sis_ratio_d)
            print(particles[k].log_sis_ratio_d)
            # Re-raise the original assertion so its traceback is not lost.
            raise
    self.p.check_depth()
    if settings.verbose >= 2:
        print('sampled particle = %5d, ancestry = %s' % (k, self.p.ancestry))
    return not same_tree
def update_p(self, particles, log_weights, log_pd, settings):
    """Sample one particle and install it as the current tree ``self.p``.

    An index ``k`` is drawn with ``sample_multinomial(softmax(log_weights))``.
    ``particles[k]`` becomes ``self.p`` and ``log_pd`` is stored as
    ``self.log_pd``.

    Args:
        particles: indexable collection of particles exposing ``node_info``,
            ``log_sis_ratio_d``, ``ancestry``, ``print_tree()`` and
            ``check_depth()``.
        log_weights: per-particle log weights, indexable by particle id.
        log_pd: value stored on ``self.log_pd``.
        settings: object whose ``verbose`` level controls diagnostics.

    Returns:
        True when the selected particle's ``node_info`` differs from that of
        the previous ``self.p``. The previous ``node_info`` is taken to be
        ``{}`` when ``self.p`` is not available yet.

    Raises:
        AssertionError: after the first call, if ``k == 0`` yields a different
            tree, or if an identical tree is sampled with ``k != 0`` but
            ``log_weights[k]`` and ``log_weights[0]`` differ (as judged by
            ``check_if_zero``).
    """
    node_info_old = {}
    p_old = None
    first_iter = False
    if settings.verbose >= 2:
        print('log_weights = %s' % log_weights)
    k = sample_multinomial(softmax(log_weights))
    try:
        # Keep a handle on the previous particle: it is needed for the
        # diagnostics below after self.p has been overwritten.
        p_old = self.p
        node_info_old = p_old.node_info
    except AttributeError:
        # first iteration probably: self.p would not be present
        first_iter = True
    same_tree = node_info_old == particles[k].node_info
    self.p = particles[k]
    self.log_pd = log_pd
    if settings.verbose >= 2:
        print('pid_sampled = %s' % k)
        print('new tree:')
        self.p.print_tree()
    if not same_tree and settings.verbose >= 1:
        print('non-identical trees')
    if k == 0 and not first_iter:
        assert same_tree
    elif same_tree and not first_iter:
        # particles from pmcmc during init might be different
        if settings.verbose >= 1:
            print('identical tree without k == 0')
        try:
            check_if_zero(log_weights[k] - log_weights[0])
        except AssertionError:
            print('node_info_old = %s' % node_info_old)
            print('same_tree = %s' % same_tree)
            print('k = %s, particles[k].node_info = %s' % (k, particles[k].node_info))
            print('log_weights[0] = %s, log_weights[k] = %s'
                  % (log_weights[0], log_weights[k]))
            # This branch is only reached when first_iter is False, so p_old
            # is the previous particle here.
            print(p_old.log_sis_ratio_d)
            print(particles[0].log_sis_ratio_d)
            print(particles[k].log_sis_ratio_d)
            # Re-raise the original assertion so its traceback is not lost.
            raise
    self.p.check_depth()
    if settings.verbose >= 2:
        print('sampled particle = %5d, ancestry = %s' % (k, self.p.ancestry))
    return not same_tree
def evaluate_new_subtree(self, data, node_id_start, param, nodes_subtree, cache, settings):
    """Recompute the ``*_new`` statistics for the subtree rooted at ``node_id_start``.

    First, every training index in ``self.train_ids[node_id_start]`` is routed
    from ``node_id_start`` to a leaf using the splits in ``self.node_info_new``;
    ``sum_y_new``, ``sum_y2_new``, ``n_points_new`` and ``train_ids_new`` are
    updated in place at each visited node, so they are expected to already
    hold an entry for each of those nodes.

    Then, for each node in ``nodes_subtree``, ``loglik_new`` is reset to
    ``-inf`` and, if the node holds data, recomputed together with
    ``param_n_new`` via ``compute_normal_normalizer``. Leaves get a fresh
    ``logprior_new``; internal nodes go through ``recompute_prob_split``.

    Args:
        data: mapping with ``'x_train'`` (indexed as ``[i, :]``) and ``'y_train'``.
        node_id_start: root of the subtree being re-evaluated.
        param, cache, settings: forwarded to the helper routines;
            ``settings.debug == 1`` enables the consistency check.
        nodes_subtree: iterable of node ids to refresh.

    Raises:
        AssertionError: in debug mode, if the log-likelihood at
            ``node_id_start`` changed (as judged by ``check_if_zero``).
    """
    for i in self.train_ids[node_id_start]:
        x_, y_ = data['x_train'][i, :], data['y_train'][i]
        node_id = copy(node_id_start)
        while True:
            self.sum_y_new[node_id] += y_
            self.sum_y2_new[node_id] += y_ ** 2
            self.n_points_new[node_id] += 1
            self.train_ids_new[node_id] = np.append(self.train_ids_new[node_id], i)
            if node_id in self.leaf_nodes:
                break
            left, right = get_children_id(node_id)
            # splitting on new criteria
            feat_id, split, idx_split_global = self.node_info_new[node_id]
            if x_[feat_id] <= split:
                node_id = left
            else:
                node_id = right
    for node_id in nodes_subtree:
        self.loglik_new[node_id] = -np.inf
        if self.n_points_new[node_id] > 0:
            self.loglik_new[node_id], self.param_n_new[node_id] = \
                compute_normal_normalizer(self.sum_y_new[node_id], self.sum_y2_new[node_id],
                                          self.n_points_new[node_id], param, cache, settings)
        if node_id in self.leaf_nodes:
            if stop_split(self.train_ids_new[node_id], settings, data, cache):
                # if leaf is empty, logprior_new[node_id] = 0.0 is incorrect; however
                # loglik_new[node_id] = -np.inf will reject move to a tree with empty leaves
                self.logprior_new[node_id] = 0.0
            else:
                # node with just 1 data point earlier could have more data points now
                self.logprior_new[node_id] = np.log(self.compute_pnosplit(node_id, param))
        else:
            # split probability might have changed if train_ids have changed
            self.recompute_prob_split(data, param, settings, cache, node_id)
    if settings.debug == 1:
        try:
            check_if_zero(self.loglik[node_id_start] - self.loglik_new[node_id_start])
        except AssertionError:
            print('train_ids[node_id_start] = %s, train_ids_new[node_id_start] = %s'
                  % (self.train_ids[node_id_start], self.train_ids_new[node_id_start]))
            # Re-raise the original assertion so its traceback is not lost.
            raise