def _get_probabilities(self, X, Y, S, S_combinations): data_vectors = self._data.get_data_vectors() N = len(data_vectors[data_vectors.keys()[0]]) X_values = data_vectors[X] Y_values = data_vectors[Y] observed_prob_dict = {} # Now we look for the value x of the variable X, and value y of the variable Y for x in self._values_dict[X]: # finding matches for x x_indices = set([element_index for (element_index, element) in enumerate(X_values) if element == x]) observed_prob_dict['P(' + X + '=' + x + ')'] = len(x_indices) / float(N) for y in self._values_dict[Y]: # finding matches for y y_indices = set([element_index for (element_index, element) in enumerate(Y_values) if element == y]) observed_prob_dict['P(' + Y + '=' + y + ')'] = len(y_indices) / float(N) xy = x_indices.intersection(y_indices) observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')'] = len(xy) / float(N) for S_combination in S_combinations: z_indices = PGMUtils.get_z_indices(S, S_combination, data_vectors) z = z_indices y_z = y_indices.intersection(z) x_z = x_indices.intersection(z) xyz = xy.intersection(z) observed_prob_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(x_z) / float(len(z)) observed_prob_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(z) / float(N) observed_prob_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(y_z) / float(len(z)) observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(xyz) / float(N) return observed_prob_dict
def get_estimated_cpds(self):
    """Build one estimated probability dictionary per node of the network.

    Iterates every node, gathers its parents, enumerates the parents'
    value combinations, and delegates to _get_probabilities; the resulting
    dicts are collected into a list and returned.

    :return: list of probability dicts, one per node, in network
             iteration order
    """
    values_dict = self._data.get_variable_values_sets()
    cpds = []
    for node in self._network:
        # NOTE(review): if self._network is a networkx 2.x DiGraph,
        # predecessors() returns an iterator — fine here since it is only
        # consumed downstream; confirm the helpers accept an iterator.
        parents = self._network.predecessors(node)
        value_combinations = PGMUtils.get_combinations(parents, values_dict)
        # NOTE(review): _get_probabilities is declared with four data
        # arguments (X, Y, S, S_combinations) but is called with three
        # here — as written this looks like it raises TypeError. Confirm
        # the intended overload/signature before relying on this method.
        probability_dict = self._get_probabilities(node, parents, value_combinations)
        cpds.append(probability_dict)
    return cpds
def _are_dseparated(self, X, Y, S, n):
    """Test whether X and Y are d-separated given the conditioning set S.

    Computes a normalized mutual-information-style score from entropies
    estimated over the observed probabilities, records it in self._nmis,
    and compares it against the threshold for order n.

    :param X: name of the first variable
    :param Y: name of the second variable
    :param S: list of conditioning-variable names (may be empty)
    :param n: index into self._mutual_info_thresholds (conditioning order)
    :return: True when the score is below the threshold (d-separated),
             False otherwise

    NOTE(review): relies on `log` being imported at module level
    (e.g. ``from math import log``) — not visible in this chunk; confirm.
    """
    H_X = H_Y = H_XY = 0
    S_combinations = PGMUtils.get_combinations(S, self._values_dict)
    probability_dict = self._get_probabilities(X, Y, S, S_combinations)
    for x in self._values_dict[X]:
        p_x = probability_dict['P(' + X + '=' + x + ')']
        for y in self._values_dict[Y]:
            p_y = probability_dict['P(' + Y + '=' + y + ')']
            # in case we are looking for zero order conditional dependency
            if len(S_combinations) == 0:
                # The 0.001 additive smoothing keeps log() defined when a
                # probability is exactly zero; it also biases every term.
                # NOTE(review): H_X accumulates -log(p_x) once per value of
                # Y (inner loop) — presumably intentional given the n_X
                # normalizer below, but worth confirming.
                H_Y += -log(p_y + 0.001)
                H_X += -log(p_x + 0.001)
                p_xy = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')']
                H_XY += -log(p_xy + 0.001)
            else:
                for S_combination in S_combinations:
                    # Look up the conditional/joint probabilities emitted by
                    # _get_probabilities; label format must match exactly.
                    p_y_z = probability_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                    p_x_z = probability_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                    p_xyz = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                    p_z = probability_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                    H_X += -log(p_x_z + 0.001)
                    H_Y += -log(p_y_z + 0.001)
                    H_XY += -log(p_xyz * p_z + 0.001)
    # If mutual information is greater than certain threshhold
    # then X and Y are dependent otherwise not
    # NOTE(review): the normalizers below (2*|values|, 4*|combinations|)
    # look tuned for binary variables — confirm they generalize to larger
    # domains.
    n_X = 2 * len(self._values_dict[X])
    n_Y = 2 * len(self._values_dict[Y])
    n_XY = 4 * len(S_combinations)
    if n_XY == 0:
        # Zero-order case (empty S): avoid dividing by zero.
        n_XY = 4
    MI = abs((H_X / n_X) + (H_Y / n_Y) - (H_XY / n_XY))
    # print 'MI(', X + ',' + Y + '|' + ','.join(S), ') = ', MI
    self._nmis[X + ',' + Y + '|' + ','.join(S)] = MI
    if MI < self._mutual_info_thresholds[n]:
        return True
    return False