def __init__(self, vars, var_values, data, factors=None, alpha=0.1):
    """
    Set up the node and grow its Chow-Liu tree on the given data slice.

    vars = the sequence of feature ids (it also becomes the node scope)
    var_values = the sequence of feature values
    data = the data slice (2d ndarray) upon which to grow a cltree
    factors = the already computed factors (for a model that has already
              been computed); accepted but not read by this constructor
    alpha = smoothing parameter
    """
    Node.__init__(self, frozenset(vars))

    self._alpha = alpha
    #
    # data is assumed to never be None
    self._data = data

    self.vars = numpy.array(vars)
    #
    # all variables are assumed to be homogeneous, so the first entry
    # is used for every feature
    # TODO: generalize this
    self._n_var_vals = var_values[0]
    self.var_values = numpy.array(var_values)

    tree_options = dict(features=self.vars,
                        n_feature_vals=self._n_var_vals,
                        feature_vals=self.var_values,
                        alpha=alpha,
                        sparse=True,
                        mem_free=True)
    self._cltree = CLTree(data, **tree_options)
def test_compute_joint_probs():
    """
    Compare the joint frequencies and the log joint probabilities stored
    by a CLTree against a brute-force recount over the module-level data,
    for alpha=0.0 and alpha=1.0, with sparse=True and sparse=False.
    """

    def _expected_logs(probs):
        # log, then -inf entries replaced by LOG_ZERO, then the diagonal
        # (feature paired with itself) reset to zero to match the tree
        logs = numpy.log(probs)
        infinite = numpy.isinf(logs)
        logs[infinite] = LOG_ZERO
        for feature in range(n_features):
            logs[feature, feature] = 0
        return logs

    #
    # creating the tree upon synthetic data
    tree = CLTree(data=data, alpha=0.0, sparse=True, mem_free=False)

    #
    # counting co-occurrences for each ordered pair of distinct features
    co_freqs = numpy.zeros((n_features, n_features, 2, 2))
    for first in range(n_features):
        for second in range(n_features):
            if first == second:
                continue
            for row in data:
                co_freqs[first, second, row[first], row[second]] += 1

    print('Computed co freqs')
    print(co_freqs)
    print(tree._joint_freqs)
    assert_almost_equal(tree._joint_freqs, co_freqs)

    #
    # going to logs, no smoothing
    expected = _expected_logs(co_freqs / n_instances)
    assert_almost_equal(expected, tree._log_joint_probs)

    #
    # dense case
    tree = CLTree(data=data, alpha=0.0, sparse=False, mem_free=False)
    assert_almost_equal(expected, tree._log_joint_probs)

    #
    # changing alpha
    alpha = 1.0
    smoothed = (co_freqs + alpha) / (n_instances + 4.0 * alpha)
    expected = _expected_logs(smoothed)

    tree = CLTree(data=data, alpha=alpha, sparse=True, mem_free=False)
    assert_almost_equal(expected, tree._log_joint_probs)

    #
    # doing a dense version
    tree = CLTree(data=data, alpha=alpha, sparse=False, mem_free=False)
    assert_almost_equal(expected, tree._log_joint_probs)
def smooth_probs(self, alpha, data=None):
    """
    Apply a new smoothing parameter by regrowing the whole tree.

    alpha = the new smoothing parameter
    data = optional data slice replacing the stored one; when None the
           data already stored on the node is reused (this is not
           treated as an error)
    """
    if data is not None:
        self._data = data
    self._alpha = alpha

    #
    # a fresh CLTree is built from the stored data with the new alpha
    rebuild_options = {'data': self._data,
                       'features': self.vars,
                       'n_feature_vals': self._n_var_vals,
                       'feature_vals': self.var_values,
                       'alpha': alpha,
                       'sparse': True,
                       'mem_free': True}
    self._cltree = CLTree(**rebuild_options)
def test_eval_instance():
    """
    Check that CLTree.eval and CLTree.eval_fact agree on every instance of
    the module-level data, for a tree over all features and for a tree
    grown on columns 0, 2 and 3 only.  Reference values from Nico's code
    are printed next to the computed ones but not asserted.
    """

    def _compare_evals(tree, reference_lls):
        # both inference routines must return the same log-likelihood
        for idx, row in enumerate(data):
            plain_ll = tree.eval(row)
            fact_ll = tree.eval_fact(row)
            assert_almost_equal(plain_ll, fact_ll)
            print(plain_ll, reference_lls[idx])

    #
    # reference values taken from Nico's code; in both tree arrays the
    # first entry is overwritten with 0 instead of -1
    ref_tree = numpy.array([-1, 2, 0, 2])
    ref_tree[0] = 0
    ref_lls = numpy.array([-2.01490302054,
                           -1.20397280433,
                           -1.20397280433,
                           -1.79175946923,
                           -1.60943791243,
                           -1.60943791243])

    ref_subtree = numpy.array([-1, 0, 1])
    ref_subtree[0] = 0
    ref_sublls = numpy.array([-1.09861228867,
                              -0.69314718056,
                              -0.69314718056,
                              -1.79175946923,
                              -1.09861228867,
                              -0.69314718056])

    #
    # growing the tree on data
    full_tree = CLTree(data, alpha=0.0, sparse=True, mem_free=False)
    print(full_tree)
    _compare_evals(full_tree, ref_lls)

    #
    # now by obscuring one column; the full instances are still passed
    # to the tree, which was given the kept feature ids
    kept_columns = [0, 2, 3]
    sub_tree = CLTree(data[:, kept_columns],
                      features=numpy.array(kept_columns),
                      alpha=0.0,
                      sparse=True,
                      mem_free=False)
    print(sub_tree)
    _compare_evals(sub_tree, ref_sublls)
def test_compute_probs_dense():
    """
    Compare the log marginal probabilities and marginal frequencies of a
    dense CLTree against values recomputed from the module-level data,
    first with alpha=0.0 and then with alpha=1.0.
    """

    def _clamped_log(values):
        # log with -inf entries replaced by LOG_ZERO
        logs = numpy.log(values)
        infinite = numpy.isinf(logs)
        logs[infinite] = LOG_ZERO
        return logs

    #
    # creating the tree upon synthetic data
    tree = CLTree(data=data, alpha=0.0, sparse=False, mem_free=False)

    #
    # per feature: how many instances have value 1 and how many value 0
    ones = data.sum(axis=0)
    zeros = n_instances - ones
    freqs = numpy.column_stack([zeros, ones])
    marginals = freqs / n_instances
    print('Computed marg freqs and probs', freqs, marginals)

    expected = _clamped_log(marginals)
    print('Computed marg logs for alpha=0', expected)
    print('CLT marg logs for alpha=0', tree._log_marg_probs)
    assert_almost_equal(tree._log_marg_probs, expected)
    assert_almost_equal(tree._marg_freqs, ones)

    #
    # now with another value for alpha
    alpha = 1.0
    tree = CLTree(data=data, alpha=alpha, sparse=False, mem_free=False)
    marginals = (freqs + 2 * alpha) / (n_instances + 4 * alpha)
    print('Computed probs with alpha=', alpha, marginals)

    expected = _clamped_log(marginals)
    print('Computed marg logs for alpha=', alpha, expected)
    print('CLT marg logs for alpha=', alpha, tree._log_marg_probs)
    assert_almost_equal(tree._log_marg_probs, expected)
    assert_almost_equal(tree._marg_freqs, ones)
def test_eval_instance():
    """
    Verify that the two CLTree inference routines (eval and eval_fact)
    return the same value for each instance of the module-level data,
    both on the whole feature set and on the subset {0, 2, 3}.  The
    numbers taken from Nico's code are only printed for comparison.
    """
    #
    # values taken from Nico's code (first tree entry forced to 0)
    expected_tree = numpy.array([-1, 2, 0, 2])
    expected_tree[0] = 0
    expected_lls = numpy.array([
        -2.01490302054, -1.20397280433, -1.20397280433,
        -1.79175946923, -1.60943791243, -1.60943791243,
    ])

    expected_subtree = numpy.array([-1, 0, 1])
    expected_subtree[0] = 0
    expected_sublls = numpy.array([
        -1.09861228867, -0.69314718056, -0.69314718056,
        -1.79175946923, -1.09861228867, -0.69314718056,
    ])

    #
    # tree over every feature, then tree with one column obscured
    feature_subset = [0, 2, 3]
    whole = CLTree(data, alpha=0.0, sparse=True, mem_free=False)
    print(whole)

    position = 0
    for sample in data:
        value = whole.eval(sample)
        value_fact = whole.eval_fact(sample)
        assert_almost_equal(value, value_fact)
        print(value, expected_lls[position])
        position += 1

    partial = CLTree(data[:, feature_subset],
                     features=numpy.array(feature_subset),
                     alpha=0.0,
                     sparse=True,
                     mem_free=False)
    print(partial)

    #
    # the unsliced instances are evaluated here as well
    position = 0
    for sample in data:
        value = partial.eval(sample)
        value_fact = partial.eval_fact(sample)
        assert_almost_equal(value, value_fact)
        print(value, expected_sublls[position])
        position += 1
def test_compute_mi():
    """
    Recompute the pairwise mutual information matrix from the module-level
    data and compare it with CLTree._mutual_info, for alpha=0.0 and
    alpha=0.5, in dense and sparse mode.
    """

    def _clamped_log(values):
        # log with -inf entries replaced by LOG_ZERO
        logs = numpy.log(values)
        logs[numpy.isinf(logs)] = LOG_ZERO
        return logs

    def _pair_log_products(log_marg):
        # entry [a, b, k, h] = log_marg[a, k] + log_marg[b, h] for a != b,
        # the diagonal blocks stay at zero
        products = numpy.zeros((n_features, n_features, 2, 2))
        for a in range(n_features):
            for b in range(n_features):
                if a == b:
                    continue
                products[a, b] = numpy.add.outer(log_marg[a], log_marg[b])
        return products

    def _log_joints(joint_probs):
        # clamped logs with the diagonal blocks reset to zero
        logs = _clamped_log(joint_probs)
        for feature in range(n_features):
            logs[feature, feature] = 0
        return logs

    def _mutual_information(log_joint, log_products):
        weighted = numpy.exp(log_joint) * (log_joint - log_products)
        return weighted.sum(axis=2).sum(axis=2)

    #
    # marginals
    ones = data.sum(axis=0)
    zeros = n_instances - ones
    freqs = numpy.column_stack([zeros, ones])
    marginals = freqs / n_instances
    print('Computed marg freqs and probs', freqs, marginals)
    log_products = _pair_log_products(_clamped_log(marginals))

    #
    # joint co-occurrence counts for each ordered pair of distinct features
    co_freqs = numpy.zeros((n_features, n_features, 2, 2))
    for a in range(n_features):
        for b in range(n_features):
            if a == b:
                continue
            for row in data:
                co_freqs[a, b, row[a], row[b]] += 1

    print('Computed co freqs')
    print(co_freqs)

    #
    # going to logs
    expected_mi = _mutual_information(_log_joints(co_freqs / n_instances),
                                      log_products)
    print('Computed MI:', expected_mi, type(expected_mi))

    tree = CLTree(data, alpha=0.0, sparse=False, mem_free=False)
    print('CLTree', tree._mutual_info, type(tree._mutual_info))
    assert_almost_equal(expected_mi, tree._mutual_info)

    #
    # adding sparsity
    tree = CLTree(data, alpha=0.0, sparse=True, mem_free=False)
    assert_almost_equal(expected_mi, tree._mutual_info)

    #
    # now with alpha
    alpha = 0.5
    denominator = n_instances + 4 * alpha
    log_products = _pair_log_products(
        _clamped_log((freqs + 2 * alpha) / denominator))
    expected_mi = _mutual_information(
        _log_joints((co_freqs + alpha) / denominator), log_products)

    tree = CLTree(data, alpha=alpha, sparse=False, mem_free=False)
    assert_almost_equal(expected_mi, tree._mutual_info)

    tree = CLTree(data, alpha=alpha, sparse=True, mem_free=False)
    assert_almost_equal(expected_mi, tree._mutual_info)
def test_compute_cond_probs():
    """
    Check CLTree joint frequencies, log conditional probabilities and
    factors against values recomputed from the module-level data.

    The expected log conditionals are obtained in log space as
    log P(x_i, x_j) - log P(x_j), first without smoothing (alpha=0.0)
    and then with alpha=2.0, and compared with both the dense and the
    sparse tree.  For the unsmoothed sparse tree each factor is also
    compared with the conditional of the feature given its parent.
    """

    def _clamped_log(probs):
        # zero probabilities are expected here (unsmoothed counts and the
        # all-zero diagonal blocks): silence the divide-by-zero warning
        # and replace the resulting -inf with LOG_ZERO
        with numpy.errstate(divide='ignore'):
            logs = numpy.log(probs)
        logs[numpy.isinf(logs)] = LOG_ZERO
        return logs

    def _expected_log_cond_probs(joint_counts, counts, alpha):
        # alpha=0.0 reduces both ratios to the plain relative frequencies
        denominator = n_instances + 4 * alpha

        log_joint_probs = _clamped_log((joint_counts + alpha) / denominator)
        #
        # to have a complete match the diagonal entries are left to zero
        for i in range(n_features):
            log_joint_probs[i, i] = 0

        probs = (counts + 2 * alpha) / denominator
        print('Computed marg freqs and probs', counts, probs)
        log_probs = _clamped_log(probs)

        log_cond_probs = numpy.zeros((n_features, n_features, 2, 2))
        for i in range(n_features):
            for j in range(n_features):
                if i != j:
                    # [k, h] block minus the marginal of feature j at h
                    log_cond_probs[i, j] = log_joint_probs[i, j] - log_probs[j]

        print('Computed log cond probs with alpha={}'.format(alpha),
              log_cond_probs)
        return log_cond_probs

    #
    # creating the tree upon synthetic data
    clt = CLTree(data=data, alpha=0.0, sparse=False, mem_free=False)

    joint_counts = numpy.zeros((n_features, n_features, 2, 2))
    for i in range(n_features):
        for j in range(n_features):
            if i != j:
                for instance in data:
                    joint_counts[i, j, instance[i], instance[j]] += 1

    print('Computed co freqs')
    print(joint_counts)
    print(clt._joint_freqs)
    assert_almost_equal(clt._joint_freqs, joint_counts)

    #
    # checking sparseness
    clt = CLTree(data=data, alpha=0.0, sparse=True, mem_free=False)
    assert_almost_equal(clt._joint_freqs, joint_counts)

    counts_1 = data.sum(axis=0)
    counts_0 = n_instances - counts_1
    counts = numpy.column_stack([counts_0, counts_1])

    log_cond_probs = _expected_log_cond_probs(joint_counts, counts, 0.0)
    assert_array_almost_equal(log_cond_probs, clt._log_cond_probs)

    #
    # testing factors
    print('\nTesting factors')
    for i in range(n_features):
        parent_id = clt._tree[i]
        # a feature that is its own parent has no conditional to compare
        if i == parent_id:
            continue
        for k in range(2):
            for h in range(2):
                print(clt._log_cond_probs[i, parent_id, k, h],
                      clt._factors[i, k, h])
                assert_array_almost_equal(
                    clt._log_cond_probs[i, parent_id, k, h],
                    clt._factors[i, k, h])

    #
    # now with smoothing
    alpha = 2.0
    clt = CLTree(data, alpha=alpha, sparse=False, mem_free=False)
    log_cond_probs = _expected_log_cond_probs(joint_counts, counts, alpha)
    assert_array_almost_equal(log_cond_probs, clt._log_cond_probs)

    #
    # sparse version
    clt = CLTree(data, alpha=alpha, sparse=True, mem_free=False)
    assert_array_almost_equal(log_cond_probs, clt._log_cond_probs)
class CLTreeNode(Node):

    """
    An input node representing a Chow-Liu Tree over a set of r.v.
    """

    def __init__(self, vars, var_values, data, factors=None, alpha=0.1):
        """
        vars = the sequence of feature ids
        var_values = the sequence of feature values
        alpha = smoothing parameter
        data = the data slice (2d ndarray) upon which to grow a cltree
        factors = the already computed factors
                  (this is when the model has already been computed);
                  currently accepted but not used by this constructor
        """
        Node.__init__(self, frozenset(vars))

        self.vars = numpy.array(vars)

        self._alpha = alpha

        #
        # assuming all variables to be homogeneous
        # TODO: generalize this
        self._n_var_vals = var_values[0]
        self.var_values = numpy.array(var_values)

        #
        # assuming data is never None
        self._data = data
        self._cltree = CLTree(data,
                              features=self.vars,
                              n_feature_vals=self._n_var_vals,
                              feature_vals=self.var_values,
                              alpha=alpha,
                              sparse=True,
                              mem_free=True)

    def smooth_probs(self, alpha, data=None):
        """
        The only way to smooth here is to rebuild the whole tree

        alpha = the new smoothing parameter
        data = optional data slice replacing the stored one; when None
               the data already stored on the node is reused
        """
        self._alpha = alpha

        if data is not None:
            self._data = data

        self._cltree = CLTree(data=self._data,
                              features=self.vars,
                              n_feature_vals=self._n_var_vals,
                              feature_vals=self.var_values,
                              alpha=alpha,
                              sparse=True,
                              mem_free=True)

    def eval(self, obs):
        """
        Dispatching inference to the cltree

        The result of CLTree.eval_fact(obs) is stored in self.log_val and
        self.log_der is reset to LOG_ZERO; nothing is returned.
        """
        #
        # TODO: do something for the derivatives
        self.log_der = LOG_ZERO

        self.log_val = self._cltree.eval_fact(obs)

    def mpe_eval(self, obs):
        """
        MPE inference is not available for this node: always raises
        NotImplementedError
        """
        raise NotImplementedError('MPE inference not yet implemented')

    def n_children(self):
        """
        Return the number of variables in the node
        """
        return len(self.vars)

    def node_type_str(self):
        """
        Return the symbol identifying Chow-Liu tree nodes
        """
        return CHOW_LIU_TREE_NODE_SYM

    def node_short_str(self):
        """
        Return a one-line representation of the node:
        type, id, comma separated variable ids, tree and factors
        """
        #
        # feature ids are not strings, they must be converted before joining
        vars_str = ','.join(str(var) for var in self.vars)
        #
        # a single template is used so that format() fills every field
        # (applied to a '+' chain it would only reach the last literal)
        return "{type} {id} <{vars}> {tree} {factors}".format(
            type=self.node_type_str(),
            id=self.id,
            vars=vars_str,
            tree=self._cltree.tree_repr(),
            factors=self._cltree.factors_repr())

    def __repr__(self):
        """
        Return the base Node representation followed by the variables,
        the number of values per variable and the tree representation
        """
        base = Node.__repr__(self)

        return ("""CLTree Smoothed Node {line1} vars: {vars} vals: {vals} tree:{tree}""".
                format(line1=base,
                       vars=self.vars,
                       vals=self._n_var_vals,
                       tree=self._cltree.tree_repr()))