def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data.

    Arguments
    ---------
    *data* : a nested numpy array
    *alpha* : a float
        The type II error rate.
    *feature_selection* : None or a string
        Whether to use IAMB as a structure learning
        or feature selection algorithm.

    Returns
    -------
    *bn* : a BayesNet object or
    *mb* : the markov blanket of a node

    Effects
    -------
    None

    Notes
    -----
    Speed Test:
    *** 5 vars, 624 obs ***
        - 196 ms
    """
    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T)-{T} that maximizes
            # mutual information of X,T|Mb(T)
            # i.e. max of mi_test(data[:,(X,T,Mb(T))])
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]) - {T}:
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X
            # if Xmax is dependent on T given Mb(T), grow the blanket.
            # BUG FIX: the original appended the stale loop variable `X`
            # instead of `max_x`, and tested `are_independent` where the
            # comment (and the IAMB algorithm) requires *dependence*.
            if max_x is not None:
                cols = (max_x, T) + tuple(Mb[T])
                if not are_independent(data[:, cols], alpha):
                    Mb[T].append(max_x)
                    Mb_change = True
                    if debug:
                        print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        # BUG FIX: iterate over a copy -- removing from a list while
        # iterating it skips elements.
        for X in list(Mb[T]):
            # if x is independent of t given Mb(T) - {x}, drop it
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))
        # CREATE BAYESNET OBJECT
        value_dict = dict(zip(range(data.shape[1]),
                              [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # BUG FIX: _T is a list; indexing the Mb dict with a list raises
        # TypeError. Return the blanket of the single selected feature.
        return Mb[_T[0]]
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    From [1]:
        "A novel algorithm for the induction of
        Markov blankets from data, called Fast-IAMB,
        that employs a heuristic to quickly recover
        the Markov blanket. Empirical results show
        that Fast-IAMB performs in many cases
        faster and more reliably than existing
        algorithms without adversely affecting the
        accuracy of the recovered Markov blankets."

    Arguments
    ---------
    *data* : a nested numpy array
    *k* : an integer
        The max number of edges to add at each iteration of
        the algorithm.
    *alpha* : a float
        Probability of Type I error

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    - The original implementation looped forever; see the
      BUG FIX comments below for the likely causes.
    """
    # get values before strings are replaced
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])
    N = data.shape[0]
    card = dict(zip(range(n_rv), unique_bins(data)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        # Candidate set: variables NOT marginally independent of T.
        # BUG FIX: the original removed elements from S while iterating it
        # (RuntimeError on Python 3, silently skips elements on Python 2)
        # and kept the *independent* variables instead of the dependent ones.
        S = {A for A in set(range(n_rv)) - {T}
             if not are_independent(data[:, (A, T)], alpha)}

        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # Calculate mutual information for all candidate variables
            mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])]))
                            for s in S])
            for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
                # Add top-MI variables while there remain at least k samples
                # per cell of the joint contingency table.
                # BUG FIX: operator precedence -- the original computed
                # (N / card[x_i]) * card[T] * ...; the heuristic requires
                # dividing N by the full product of cardinalities.
                if N / (card[x_i] * card[T] *
                        np.prod([card[b] for b in Mb[T]])) >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            # BUG FIX: iterate over a copy -- removing while iterating
            # skips elements.
            for A in list(Mb[T]):
                cols = (A, T) + tuple(set(Mb[T]) - {A})
                # if A is independent of T given Mb[T] - {A}, remove A
                if are_independent(data[:, cols], alpha):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print('Breaking..')
                break
            else:
                A = set(range(n_rv)) - {T} - set(Mb[T])
                S = set()
                for a in A:
                    cols = (a, T) + tuple(Mb[T])
                    # BUG FIX: candidates for another pass are the variables
                    # still *dependent* on T given the current blanket; the
                    # original re-added the independent ones, which kept S
                    # non-empty forever (the infinite loop noted above).
                    if not are_independent(data[:, cols], alpha):
                        S.add(a)

        if debug:
            print('Done with %s' % T)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        # CREATE BAYESNET OBJECT
        # BUG FIX: the original returned the undefined name `BN`.
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # BUG FIX: _T is a list; return the blanket of the selected feature.
        return Mb[_T[0]]
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data.

    Arguments
    ---------
    *data* : a nested numpy array
    *alpha* : a float
        The type II error rate.
    *feature_selection* : None or a string
        Whether to use IAMB as a structure learning
        or feature selection algorithm.

    Returns
    -------
    *bn* : a BayesNet object or
    *mb* : the markov blanket of a node

    Effects
    -------
    None

    Notes
    -----
    Speed Test:
    *** 5 vars, 624 obs ***
        - 196 ms
    """
    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # find X_max in V-Mb(T)-{T} that maximizes
            # mutual information of X,T|Mb(T)
            # i.e. max of mi_test(data[:,(X,T,Mb(T))])
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]) - {T}:
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X
            # if Xmax is dependent on T given Mb(T), grow the blanket.
            # BUG FIX: the original appended the stale loop variable `X`
            # instead of `max_x`, and tested `are_independent` where the
            # comment (and the IAMB algorithm) requires *dependence*.
            if max_x is not None:
                cols = (max_x, T) + tuple(Mb[T])
                if not are_independent(data[:, cols], alpha):
                    Mb[T].append(max_x)
                    Mb_change = True
                    if debug:
                        print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        # BUG FIX: iterate over a copy -- removing from a list while
        # iterating it skips elements.
        for X in list(Mb[T]):
            # if x is independent of t given Mb(T) - {x}, drop it
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))
        # CREATE BAYESNET OBJECT
        value_dict = dict(zip(range(data.shape[1]),
                              [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # BUG FIX: _T is a list; indexing the Mb dict with a list raises
        # TypeError. Return the blanket of the single selected feature.
        return Mb[_T[0]]
def lambda_iamb(data, L=1.5, alpha=0.05, feature_selection=None):
    """
    Lambda IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data.

    This Algorithm is similar to the iamb algorithm, except
    that it allows for a "lambda" coefficient that helps
    avoid false positives.

    This algorithm was originally developed for use as a
    feature selection algorithm - discovering the markov
    blanket of a target variable is equivalent to discovering
    the relevant features for classifications.

    In practice, this algorithm does just as well as a feature
    selection method compared to IAMB when naive bayes was
    used as a classifier, but Lambda-iamb actually does much
    better than traditional iamb when traditional iamb does
    very poorly due to high false positive rates.

    Arguments
    ---------
    *data* : a nested numpy array
    *L* : a float
        The lambda hyperparameter - see [1].
    *alpha* : a float
        The type II error rate.

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    """
    n_rv = data.shape[1]
    # BUG FIX: blanket members are manipulated with set operations below
    # (.add, set difference, .remove); the original initialized each
    # blanket as an empty *dict*, which has no .add method.
    Mb = dict([(rv, set()) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # entropy of T together with its current blanket
            cols = (T,) + tuple(Mb[T])
            H_tmb = entropy(data[:, cols])
            # find X1_min in V-Mb[T]-{T} that minimizes
            # entropy of T|X1_inMb[T]
            # i.e. min of entropy(data[:,(T,X,Mb[T])])
            min_val1, min_val2 = 1e7, 1e7
            min_x1, min_x2 = None, None
            for X in V - Mb[T] - {T}:
                cols = (T, X) + tuple(Mb[T])
                ent_val = entropy(data[:, cols])
                # BUG FIX: the original compared against the undefined
                # name `min_val` (NameError); track the two smallest.
                if ent_val < min_val1:
                    min_val2, min_val1 = min_val1, ent_val
                    min_x2, min_x1 = min_x1, X
            # if min_x1 is dependent on T given Mb[T], grow the blanket.
            # NOTE(review): the original tested `are_independent(...)` and
            # added the stale loop variable `X`; dependence is the condition
            # for blanket membership -- confirm against [1].
            if min_x1 is not None:
                cols = (min_x1, T) + tuple(Mb[T])
                if not are_independent(data[:, cols], alpha):
                    if (min_val2 - L * min_val1) < ((1 - L) * H_tmb):
                        # lambda heuristic: admit the top two candidates
                        # together when both are dependent on T.
                        if min_x2 is not None:
                            cols = (min_x2, T) + tuple(Mb[T])
                            if not are_independent(data[:, cols], alpha):
                                Mb[T].add(min_x1)
                                Mb[T].add(min_x2)
                                Mb_change = True
                    else:
                        Mb[T].add(min_x1)
                        Mb_change = True

        # SHRINKING PHASE
        # BUG FIX: iterate over a copy -- removing from a set while
        # iterating it raises RuntimeError on Python 3.
        for X in list(Mb[T]):
            # if x is independent of t given Mb[T] - {x}, drop it
            cols = (X, T) + tuple(Mb[T] - {X})
            if mi_test(data[:, cols]) > alpha:
                Mb[T].remove(X)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_Mb(edge_dict, Mb, data, alpha)
        # CREATE BAYESNET OBJECT
        value_dict = dict(zip(range(data.shape[1]),
                              [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # BUG FIX: _T is a list; return the blanket of the selected feature.
        return Mb[_T[0]]
def lambda_iamb(data, L=1.5, alpha=0.05, feature_selection=None):
    """
    Lambda IAMB Algorithm for learning the structure of a
    Discrete Bayesian Network from data.

    This Algorithm is similar to the iamb algorithm, except
    that it allows for a "lambda" coefficient that helps
    avoid false positives.

    This algorithm was originally developed for use as a
    feature selection algorithm - discovering the markov
    blanket of a target variable is equivalent to discovering
    the relevant features for classifications.

    In practice, this algorithm does just as well as a feature
    selection method compared to IAMB when naive bayes was
    used as a classifier, but Lambda-iamb actually does much
    better than traditional iamb when traditional iamb does
    very poorly due to high false positive rates.

    Arguments
    ---------
    *data* : a nested numpy array
    *L* : a float
        The lambda hyperparameter - see [1].
    *alpha* : a float
        The type II error rate.

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    """
    n_rv = data.shape[1]
    # BUG FIX: blanket members are manipulated with set operations below
    # (.add, set difference, .remove); the original initialized each
    # blanket as an empty *dict*, which has no .add method.
    Mb = dict([(rv, set()) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False
            # entropy of T together with its current blanket
            cols = (T,) + tuple(Mb[T])
            H_tmb = entropy(data[:, cols])
            # find X1_min in V-Mb[T]-{T} that minimizes
            # entropy of T|X1_inMb[T]
            # i.e. min of entropy(data[:,(T,X,Mb[T])])
            min_val1, min_val2 = 1e7, 1e7
            min_x1, min_x2 = None, None
            for X in V - Mb[T] - {T}:
                cols = (T, X) + tuple(Mb[T])
                ent_val = entropy(data[:, cols])
                # BUG FIX: the original compared against the undefined
                # name `min_val` (NameError); track the two smallest.
                if ent_val < min_val1:
                    min_val2, min_val1 = min_val1, ent_val
                    min_x2, min_x1 = min_x1, X
            # if min_x1 is dependent on T given Mb[T], grow the blanket.
            # NOTE(review): the original tested `are_independent(...)` and
            # added the stale loop variable `X`; dependence is the condition
            # for blanket membership -- confirm against [1].
            if min_x1 is not None:
                cols = (min_x1, T) + tuple(Mb[T])
                if not are_independent(data[:, cols], alpha):
                    if (min_val2 - L * min_val1) < ((1 - L) * H_tmb):
                        # lambda heuristic: admit the top two candidates
                        # together when both are dependent on T.
                        if min_x2 is not None:
                            cols = (min_x2, T) + tuple(Mb[T])
                            if not are_independent(data[:, cols], alpha):
                                Mb[T].add(min_x1)
                                Mb[T].add(min_x2)
                                Mb_change = True
                    else:
                        Mb[T].add(min_x1)
                        Mb_change = True

        # SHRINKING PHASE
        # BUG FIX: iterate over a copy -- removing from a set while
        # iterating it raises RuntimeError on Python 3.
        for X in list(Mb[T]):
            # if x is independent of t given Mb[T] - {x}, drop it
            cols = (X, T) + tuple(Mb[T] - {X})
            if mi_test(data[:, cols]) > alpha:
                Mb[T].remove(X)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_Mb(edge_dict, Mb, data, alpha)
        # CREATE BAYESNET OBJECT
        value_dict = dict(zip(range(data.shape[1]),
                              [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # BUG FIX: _T is a list; return the blanket of the selected feature.
        return Mb[_T[0]]
def fast_iamb(data, k=5, alpha=0.05, feature_selection=None, debug=False):
    """
    From [1]:
        "A novel algorithm for the induction of
        Markov blankets from data, called Fast-IAMB,
        that employs a heuristic to quickly recover
        the Markov blanket. Empirical results show
        that Fast-IAMB performs in many cases
        faster and more reliably than existing
        algorithms without adversely affecting the
        accuracy of the recovered Markov blankets."

    Arguments
    ---------
    *data* : a nested numpy array
    *k* : an integer
        The max number of edges to add at each iteration of
        the algorithm.
    *alpha* : a float
        Probability of Type I error

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    - The original implementation looped forever; see the
      BUG FIX comments below for the likely causes.
    """
    # get values before strings are replaced
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    # replace strings
    data = replace_strings(data)

    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])
    N = data.shape[0]
    card = dict(zip(range(n_rv), unique_bins(data)))

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert (not isinstance(feature_selection, list)), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        # Candidate set: variables NOT marginally independent of T.
        # BUG FIX: the original removed elements from S while iterating it
        # (RuntimeError on Python 3, silently skips elements on Python 2)
        # and kept the *independent* variables instead of the dependent ones.
        S = {A for A in set(range(n_rv)) - {T}
             if not are_independent(data[:, (A, T)], alpha)}

        while S:
            insufficient_data = False

            #### GROW PHASE ####
            # Calculate mutual information for all candidate variables
            mi_dict = dict([(s, mi_test(data[:, (s, T) + tuple(Mb[T])]))
                            for s in S])
            for x_i in sorted(mi_dict, key=mi_dict.get, reverse=True):
                # Add top-MI variables while there remain at least k samples
                # per cell of the joint contingency table.
                # BUG FIX: operator precedence -- the original computed
                # (N / card[x_i]) * card[T] * ...; the heuristic requires
                # dividing N by the full product of cardinalities.
                if N / (card[x_i] * card[T] *
                        np.prod([card[b] for b in Mb[T]])) >= k:
                    Mb[T].append(x_i)
                else:
                    insufficient_data = True
                    break

            #### SHRINK PHASE ####
            removed_vars = False
            # BUG FIX: iterate over a copy -- removing while iterating
            # skips elements.
            for A in list(Mb[T]):
                cols = (A, T) + tuple(set(Mb[T]) - {A})
                # if A is independent of T given Mb[T] - {A}, remove A
                if are_independent(data[:, cols], alpha):
                    Mb[T].remove(A)
                    removed_vars = True

            #### FINALIZE BLANKET FOR "T" OR MAKE ANOTHER PASS ####
            if insufficient_data and not removed_vars:
                if debug:
                    print('Breaking..')
                break
            else:
                A = set(range(n_rv)) - {T} - set(Mb[T])
                S = set()
                for a in A:
                    cols = (a, T) + tuple(Mb[T])
                    # BUG FIX: candidates for another pass are the variables
                    # still *dependent* on T given the current blanket; the
                    # original re-added the independent ones, which kept S
                    # non-empty forever (the infinite loop noted above).
                    if not are_independent(data[:, cols], alpha):
                        S.add(a)

        if debug:
            print('Done with %s' % T)

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        # ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        # CREATE BAYESNET OBJECT
        # BUG FIX: the original returned the undefined name `BN`.
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # BUG FIX: _T is a list; return the blanket of the selected feature.
        return Mb[_T[0]]