def conditional_entropy(self, col_a, col_b, n_samples=1000):
    """ The conditional entropy, H(A|B), of `col_a` given `col_b`.

    Implementation notes
    --------------------
    Uses Monte Carlo integration, at least for the joint entropy
    component.

    Parameters
    ----------
    col_a : indexer
        The name of the first column.
    col_b : indexer
        The name of the second column.
    n_samples : int
        The number of samples to use for the Monte Carlo approximation
        (ignored if the columns are categorical).

    Returns
    -------
    h_c : float
        The conditional entropy of `col_a` given `col_b`.
    """
    col_idxs = [self._converters['col2idx'][col_a],
                self._converters['col2idx'][col_b]]

    # Chain rule: H(A|B) = H(A,B) - H(B)
    h_ab = mu.joint_entropy(self._models, col_idxs, n_samples)
    h_b = self.entropy(col_b, n_samples)

    h_c = h_ab - h_b

    return h_c
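
# A minimal, self-contained sanity check of the chain rule used above,
# H(A|B) = H(A,B) - H(B), on a small discrete joint table where both sides
# can be computed exactly. This helper and the table `p_ab` are hypothetical
# illustrations, not part of the engine API; the engine estimates these
# entropies by Monte Carlo instead.
def _check_conditional_entropy_identity():
    import numpy as np

    p_ab = np.array([[0.3, 0.1],
                     [0.2, 0.4]])        # joint P(A, B); columns index B
    p_b = p_ab.sum(axis=0)               # marginal P(B)

    h_ab = -np.sum(p_ab * np.log(p_ab))  # joint entropy H(A, B)
    h_b = -np.sum(p_b * np.log(p_b))     # marginal entropy H(B)
    h_cond = h_ab - h_b                  # chain rule: H(A|B) = H(A,B) - H(B)

    # Direct form for comparison: H(A|B) = -sum_{a,b} p(a,b) * log p(a|b)
    h_direct = -np.sum(p_ab * np.log(p_ab / p_b))

    assert np.isclose(h_cond, h_direct)  # both ≈ 0.587 nats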
def mutual_information(self, col_a, col_b, normed=True, linfoot=False,
                       n_samples=1000):
    """ The mutual information, I(A, B), between two columns.

    Parameters
    ----------
    col_a : indexer
        The name of the first column.
    col_b : indexer
        The name of the second column.
    normed : bool
        If True, the mutual information, I, is normed according to the
        symmetric uncertainty, U = 2*I(A, B)/(H(A) + H(B)).
    linfoot : bool
        If True, return the Linfoot information correlation,
        (1 - exp(-2*I))**.5, instead (overrides `normed`).
    n_samples : int
        The number of samples to use for the Monte Carlo approximation
        (ignored if the columns are categorical).

    Returns
    -------
    mi : float
        The mutual information between `col_a` and `col_b`.
    """
    if linfoot:
        normed = False

    idx_a = self._converters['col2idx'][col_a]
    idx_b = self._converters['col2idx'][col_b]

    # Only models that assign both columns to the same view carry any
    # dependence between them.
    models = []
    for model in self._models:
        if model['col_assignment'][idx_a] == \
                model['col_assignment'][idx_b]:
            models.append(model)

    # If no model puts the columns in the same view, they are independent
    # under the posterior; return early so the normalization below does
    # not touch the (undefined) marginal entropies.
    if len(models) == 0:
        return 0.0

    h_a = self.entropy(col_a, n_samples=n_samples)
    h_b = self.entropy(col_b, n_samples=n_samples)
    h_ab = mu.joint_entropy(models, [idx_a, idx_b], n_samples)

    mi = h_a + h_b - h_ab

    # XXX: Differential entropy can be negative. Here we prevent
    # negative mutual information.
    mi = max(mi, 0.)

    if normed:
        # Normalize using the symmetric uncertainty.
        mi = 2.*mi/(h_a + h_b)

    if linfoot:
        # Assumes a module-level `from math import exp`.
        mi = (1. - exp(-2*mi))**.5

    return mi
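
# A hedged, self-contained illustration of the quantities used above: it
# computes I(A, B) = H(A) + H(B) - H(A,B) exactly on a small discrete joint
# table, then applies the two transforms from `mutual_information`, the
# symmetric uncertainty and the Linfoot information correlation. The helper
# and its numbers are hypothetical, not part of the engine API.
def _check_mutual_information_transforms():
    from math import exp, sqrt

    import numpy as np

    p_ab = np.array([[0.3, 0.1],
                     [0.2, 0.4]])       # joint P(A, B)
    p_a = p_ab.sum(axis=1)              # marginal P(A)
    p_b = p_ab.sum(axis=0)              # marginal P(B)

    h_a = -np.sum(p_a * np.log(p_a))
    h_b = -np.sum(p_b * np.log(p_b))
    h_ab = -np.sum(p_ab * np.log(p_ab))

    mi = max(h_a + h_b - h_ab, 0.)      # raw MI, clamped at zero
    u = 2. * mi / (h_a + h_b)           # symmetric uncertainty, in [0, 1]
    linfoot = sqrt(1. - exp(-2. * mi))  # Linfoot information correlation

    assert 0. <= u <= 1. and 0. <= linfoot < 1.
    return mi, u, linfoot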