def build_bayesnet(graph, data):
    directed = direct_edges(graph)
    edge_dict = generate_edge_dict(directed)
    print(edge_dict)
    value_dict = generate_value_dict(data)
    print(value_dict)
    return BayesNet(edge_dict, value_dict)
def chow_liu(data, edges_only=False):
    """
    Perform the Chow-Liu structure learning algorithm over an entire
    dataset, and return the BN-tree.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which we will learn. It should be the entire dataset.

    Returns
    -------
    *bn* : a BayesNet object
        The structure-learned BN.

    Effects
    -------
    None

    Notes
    -----
    The maximum-weight spanning tree is built with a Prim-style pass over the
    edges sorted by pairwise mutual information (Kruskal's algorithm with a
    union-find structure would work equally well).
    """
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))
    n_rv = data.shape[1]

    # pairwise mutual information is used as the (undirected) edge weight;
    # edge_list only stores i < j since i->j and j->i are the same edge
    edge_list = [(i, j, mi_test(data[:, (i, j)], chi2_test=False))
                 for i in range(n_rv) for j in range(i + 1, n_rv)]
    edge_list.sort(key=operator.itemgetter(2), reverse=True)  # sort by weight

    vertex_cache = {edge_list[0][0]}  # start with first vertex..
    mst = dict((rv, []) for rv in range(n_rv))

    # Prim's algorithm: on each pass, add the heaviest edge crossing the cut
    # between vertex_cache and the remaining vertices. Because edge_list is
    # sorted by weight, the first crossing edge found is the safe edge.
    while len(vertex_cache) < n_rv:
        for i, j, w in edge_list:
            if i in vertex_cache and j not in vertex_cache:
                mst[i].append(j)
                vertex_cache.add(j)
                break
            elif i not in vertex_cache and j in vertex_cache:
                mst[j].append(i)
                vertex_cache.add(i)
                break
        else:
            break  # no crossing edge left (disconnected graph)

    if edges_only:
        return mst, value_dict

    bn = BayesNet(mst, value_dict)
    return bn
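# Usage sketch (illustrative only, not part of the original module): assumes
# `data` is a discrete-valued numpy array with rows = observations and
# columns = variables, as the docstring above describes.
#
#     import numpy as np
#     data = np.random.randint(0, 3, size=(500, 5))   # 500 obs, 5 discrete RVs
#     tree_bn = chow_liu(data)                        # BayesNet with tree structure
#     mst, values = chow_liu(data, edges_only=True)   # or just the edge dict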
def naive_bayes(data, target, estimator='mle'):
    """
    Learn a naive Bayes model from data.

    The Naive Bayes model is a tree-based model in which all random variables
    have the same parent (the "target" variable). From a probabilistic
    standpoint, the implication of this model is that all random variables
    (i.e. features) are assumed to be conditionally independent of any other
    random variable, conditioned upon the single parent (target) variable.

    It turns out that this model performs quite well as a classifier, and can
    be used as such. Moreover, this model is quite fast and simple to
    learn/create from a computational standpoint.

    Note that this function not only learns the structure, but ALSO learns
    the parameters.

    Arguments
    ---------
    *data* : a nested numpy array

    *target* : an integer
        The target variable column in *data*

    Returns
    -------
    *bn* : a BayesNet object, with the structure instantiated.

    Effects
    -------
    None

    Notes
    -----
    """
    value_dict = dict(zip(range(data.shape[1]),
                          [list(np.unique(col)) for col in data.T]))

    # every non-target variable becomes a child of the target
    edge_dict = {target: [v for v in value_dict if v != target]}
    edge_dict.update(dict([(rv, []) for rv in value_dict if rv != target]))

    bn = BayesNet(edge_dict, value_dict)
    if estimator == 'bayes':
        bayes_estimator(bn, data)
    else:
        mle_estimator(bn, data)
    return bn
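# Usage sketch (illustrative only): learn a naive Bayes classifier over a
# small discrete dataset, with column 0 as the class/target variable.
#
#     import numpy as np
#     data = np.random.randint(0, 2, size=(200, 4))
#     nb = naive_bayes(data, target=0)                       # MLE parameters
#     nb_b = naive_bayes(data, target=0, estimator='bayes')  # Bayesian estimate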
def read_mat(path, delim=' '):
    """
    Read an adjacency matrix into a BayesNet object.

    NOTE: This is for reading the structure only, and therefore no parameters
    for the BayesNet object will be set - they must be learned by calling
    "mle_estimator" or "bayes_estimator" on the object.
    """
    _V = []
    _E = {}
    _F = {}

    with open(path, 'r') as f:
        for line in f:
            line = line.split(delim)
            rv = line[0]
            # NOTE: only the vertex names (first column) are read here;
            # the children lists are left empty.
            _E[rv] = []

    bn = BayesNet(_E)
    return bn
def bridge(c_bn, f_bn, data):
    """
    Make a Multi-Dimensional Bayesian Network by bridging two Bayesian
    network structures. This happens by placing edges from c_bn -> f_bn
    using a heuristic optimization procedure.

    This can be used to create a Multi-Dimensional Bayesian Network
    classifier from two already-learned Bayesian networks - one of which is a
    BN containing all the class variables, the other containing all the
    feature variables.

    Arguments
    ---------
    *c_bn* : a BayesNet object with known structure

    *f_bn* : a BayesNet object with known structure.

    Returns
    -------
    *m_bn* : a merged/bridge BayesNet object,
        whose structure contains *c_bn*, *f_bn*, and some bridge edges
        between them.
    """
    restrict = []
    for u in c_bn:
        for v in f_bn:
            restrict.append((u, v))  # only allow edges from c_bn -> f_bn

    bridge_bn = hc_rr(data, restriction=restrict)

    m_bn = bridge_bn.E
    m_bn.update(c_bn.E)
    m_bn.update(f_bn.E)

    mbc_bn = BayesNet(E=m_bn)
    return mbc_bn
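# Usage sketch (illustrative only): bridge a class-variable BN and a
# feature-variable BN that were learned separately, then merge them. `c_bn`,
# `f_bn` and `data` are assumed to come from earlier structure-learning calls.
#
#     m_bn = bridge(c_bn, f_bn, data)   # bridge edges only from c_bn -> f_bn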
def gs(data, alpha=0.05, feature_selection=None, debug=False):
    """
    Perform the grow-shrink algorithm over the dataset to learn a Bayesian
    network structure.

    This algorithm is clearly a good candidate for numba JIT compilation...

    STEPS
    -----
    1. Compute Markov Blanket
    2. Compute Graph Structure
    3. Orient Edges
    4. Remove Cycles
    5. Reverse Edges
    6. Propagate Directions

    Arguments
    ---------
    *data* : a nested numpy array
        Data from which you wish to learn structure

    *alpha* : a float
        Type I error rate for independence test

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    Speed Test:
        *** 5 variables, 624 observations ***
        - 63.7 ms
    """
    n_rv = data.shape[1]
    data, value_dict = replace_strings(data, return_values=True)

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # STEP 1 : COMPUTE MARKOV BLANKETS
    Mb = dict([(rv, []) for rv in range(n_rv)])

    for X in _T:
        S = []

        grow_condition = True
        while grow_condition:
            grow_condition = False
            for Y in range(n_rv):
                if X != Y and Y not in S:
                    # if there exists some Y such that Y is dependent
                    # on X given S, add Y to S
                    cols = (X, Y) + tuple(S)
                    pval = mi_test(data[:, cols])
                    if pval < alpha:  # dependent
                        grow_condition = True  # dependent -> continue searching
                        S.append(Y)

        shrink_condition = True
        while shrink_condition:
            TEMP_S = []
            shrink_condition = False
            for Y in S:
                s_copy = copy(S)
                s_copy.remove(Y)  # condition on S - {Y}
                # if X independent of Y given S-{Y}, leave Y out
                # if X dependent of Y given S-{Y}, keep it in
                cols = (X, Y) + tuple(s_copy)
                pval = mi_test(data[:, cols])
                if pval < alpha:  # dependent
                    TEMP_S.append(Y)
                else:  # independent -> continue searching
                    shrink_condition = True
            # update the conditioning set before the next shrink pass,
            # otherwise the loop would never terminate once a variable is dropped
            S = TEMP_S

        Mb[X] = TEMP_S
        if debug:
            print('Markov Blanket for %s : %s' % (X, str(TEMP_S)))

    if feature_selection is None:
        # STEP 2: COMPUTE GRAPH STRUCTURE
        # i.e. Resolve Markov Blanket
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))

        # STEP 3: ORIENT EDGES
        oriented_edge_dict = orient_edges_MB(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        bn = BayesNet(oriented_edge_dict, value_dict)
        return bn
    else:
        # feature selection: return the Markov blanket of the single target
        return Mb[_T[0]]
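# Usage sketch (illustrative only): run grow-shrink either as a full structure
# learner or as a Markov-blanket feature selector.
#
#     import numpy as np
#     data = np.random.randint(0, 2, size=(624, 5))
#     bn = gs(data, alpha=0.05)                 # learned BayesNet
#     mb_of_0 = gs(data, feature_selection=0)   # Markov blanket of column 0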
def tabu(data, k=5, metric='AIC', max_iter=100, debug=False, restriction=None):
    """
    Tabu search for score-based structure learning.

    The algorithm maintains a list called "tabu_list", which consists of
    3-tuples, where the first two elements constitute the edge which is
    tabued, and the third element is a string - either 'Addition',
    'Deletion', or 'Reversal' denoting the operation associated with the
    edge.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which the Bayesian network structure will be learned.

    *metric* : a string
        Which score metric to use.
        Options:
            - AIC
            - BIC / MDL
            - LL (log-likelihood)

    *max_iter* : an integer
        The maximum number of iterations of the hill-climbing algorithm to
        run. Note that the algorithm will terminate on its own if no
        improvement is made in a given iteration.

    *debug* : boolean
        Whether to print the scores/moves of the algorithm as it happens.

    *restriction* : a list of 2-tuples
        For MMHC algorithm, the list of allowable edge additions.

    Returns
    -------
    *bn* : a BayesNet object
    """
    nrow = data.shape[0]
    ncol = data.shape[1]
    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = dict([(n, []) for n in names])
    p_dict = dict([(n, []) for n in names])

    # COMPUTE INITIAL LIKELIHOOD SCORE
    value_dict = dict([(n, np.unique(data[:, i])) for i, n in enumerate(names)])
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    max_score = info_score(bn, nrow, metric)

    tabu_list = [None] * k

    _iter = 0
    improvement = True

    while improvement:
        improvement = False
        max_delta = 0

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                # CHECK TABU LIST - can't add back a deletion on the tabu list
                if (u, v, 'Deletion') not in tabu_list:
                    # CHECK EDGE EXISTENCE AND CYCLICITY
                    if v not in c_dict[u] and u != v and not would_cause_cycle(c_dict, u, v):
                        # FOR MMHC ALGORITHM -> Edge Restrictions
                        if restriction is None or (u, v) in restriction:
                            # SCORE FOR 'V' -> gaining a parent
                            old_cols = (v,) + tuple(p_dict[v])  # without 'u' as parent
                            mi_old = mutual_information(data[:, old_cols])
                            new_cols = old_cols + (u,)  # with 'u' as parent
                            mi_new = mutual_information(data[:, new_cols])
                            delta_score = nrow * (mi_old - mi_new)

                            if delta_score > max_delta:
                                if debug:
                                    print('Improved Arc Addition: ', (u, v))
                                    print('Delta Score: ', delta_score)
                                max_delta = delta_score
                                max_operation = 'Addition'
                                max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                # CHECK TABU LIST - can't delete an addition on the tabu list
                if (u, v, 'Addition') not in tabu_list:
                    if v in c_dict[u]:
                        # SCORE FOR 'V' -> losing a parent
                        old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Deletion: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Deletion'
                            max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                # CHECK TABU LIST - can't reverse back a reversal on the tabu list
                if (u, v, 'Reversal') not in tabu_list:
                    if v in c_dict[u] and not would_cause_cycle(c_dict, v, u, reverse=True):
                        # SCORE FOR 'U' -> gaining 'v' as parent
                        old_cols = (u,) + tuple(p_dict[u])  # without 'v' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (v,)  # with 'v' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta1 = nrow * (mi_old - mi_new)

                        # SCORE FOR 'V' -> losing 'u' as parent
                        old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta2 = nrow * (mi_old - mi_new)

                        # COMBINED DELTA-SCORES
                        delta_score = delta1 + delta2

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Reversal: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Reversal'
                            max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
                tabu_list[_iter % k] = (u, v, 'Addition')
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                tabu_list[_iter % k] = (u, v, 'Deletion')
            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
                tabu_list[_iter % k] = (u, v, 'Reversal')
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    # rebuild the BayesNet from the final structure
    bn = BayesNet(c_dict)
    return bn
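# Usage sketch (illustrative only): tabu search with a tabu list of the last
# k moves.
#
#     import numpy as np
#     data = np.random.randint(0, 3, size=(1000, 6))
#     bn = tabu(data, k=5, metric='AIC', max_iter=100, debug=True)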
def read_bif(path):
    """
    This function reads a .bif file into a BayesNet object. It's probably not
    the fastest or prettiest but it gets the job done.

    Arguments
    ---------
    *path* : a string
        The path

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    *V* : a list of strings
    *E* : a dict, where key = vertex, val = list of its children
    *F* : a dict, where key = rv, val = another dict with
            keys = 'parents', 'values', 'cpt'
    """
    _parents = {}  # key = vertex, value = list of vertices in the scope (including itself)
    _cpt = {}      # key = vertex, value = list (then numpy array)
    _vals = {}     # key = vertex, value = list of its possible values

    with open(path, 'r') as f:
        while True:
            line = f.readline()
            if 'variable' in line:
                new_vertex = line.split()[1]

                _parents[new_vertex] = []
                _cpt[new_vertex] = []
                #_vals[new_vertex] = []

                new_line = f.readline()
                new_vals = new_line.replace(',', ' ').split()[6:-1]  # list of vals
                _vals[new_vertex] = new_vals
                num_outcomes = len(new_vals)

            elif 'probability' in line:
                line = line.replace(',', ' ')
                child_rv = line.split()[2]
                parent_rvs = line.split()[4:-2]

                if len(parent_rvs) == 0:  # prior
                    new_line = f.readline().replace(';', ' ').replace(',', ' ').split()
                    prob_values = new_line[1:]
                    _cpt[child_rv].append(list(map(float, prob_values)))
                    #_cpt[child_rv] = map(float,prob_values)
                else:  # not a prior
                    _parents[child_rv].extend(list(parent_rvs))
                    while True:
                        new_line = f.readline()
                        if '}' in new_line:
                            break
                        new_line = new_line.replace(',', ' ').replace(';', ' ') \
                                           .replace('(', ' ').replace(')', ' ').split()
                        # the trailing numbers on the line are the probabilities,
                        # one per outcome of the child variable
                        prob_values = new_line[-(len(_vals[child_rv])):]
                        prob_values = list(map(float, prob_values))
                        _cpt[child_rv].append(prob_values)

            if line == '':
                break

    # CREATE FACTORS
    _F = {}
    _E = {}
    for rv in _vals.keys():
        _E[rv] = [c for c in _vals.keys() if rv in _parents[c]]
        f = {
            'parents': _parents[rv],
            'values': _vals[rv],
            'cpt': [item for sublist in _cpt[rv] for item in sublist]
        }
        _F[rv] = f

    bn = BayesNet()
    bn.F = _F
    bn.E = _E
    bn.V = list(topsort(_E))

    return bn
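# Usage sketch (illustrative only; the path below is illustrative): read a
# network structure plus CPTs from a .bif file.
#
#     bn = read_bif('data/cancer.bif')
#     print(bn.V)   # topologically sorted vertices
#     print(bn.E)   # children lists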
def hc_rr(data, M=5, R=3, metric='AIC', max_iter=100, debug=False, restriction=None):
    """
    Hill climbing with random restarts: whenever the greedy search stops
    improving, up to *R* restarts are performed, each of which applies *M*
    random moves (additions, deletions, reversals) before climbing again.

    Arguments
    ---------
    *data* : a nested numpy array
        The data from which the Bayesian network structure will be learned.

    *metric* : a string
        Which score metric to use.
        Options:
            - AIC
            - BIC / MDL
            - LL (log-likelihood)

    *max_iter* : an integer
        The maximum number of iterations of the hill-climbing algorithm to
        run. Note that the algorithm will terminate on its own if no
        improvement is made in a given iteration.

    *debug* : boolean
        Whether to print the scores/moves of the algorithm as it happens.

    *restriction* : a list of 2-tuples
        For MMHC algorithm, the list of allowable edge additions.

    Returns
    -------
    *bn* : a BayesNet object
    """
    nrow = data.shape[0]
    ncol = data.shape[1]
    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = dict([(n, []) for n in names])
    p_dict = dict([(n, []) for n in names])

    # COMPUTE INITIAL LIKELIHOOD SCORE
    value_dict = dict([(n, np.unique(data[:, i])) for i, n in enumerate(names)])
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    max_score = info_score(bn, nrow, metric)

    _iter = 0
    improvement = True
    _restarts = 0

    while improvement:
        improvement = False
        max_delta = 0

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v not in c_dict[u] and u != v and not would_cause_cycle(c_dict, u, v):
                    # FOR MMHC ALGORITHM -> Edge Restrictions
                    if restriction is None or (u, v) in restriction:
                        # SCORE FOR 'V' -> gaining a parent
                        old_cols = (v,) + tuple(p_dict[v])  # without 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (u,)  # with 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            if debug:
                                print('Improved Arc Addition: ', (u, v))
                                print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Addition'
                            max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u]:
                    # SCORE FOR 'V' -> losing a parent
                    old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta_score = nrow * (mi_old - mi_new)

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Deletion: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Deletion'
                        max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u] and not would_cause_cycle(c_dict, v, u, reverse=True):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u,) + tuple(p_dict[u])  # without 'v' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = old_cols + (v,)  # with 'v' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta1 = nrow * (mi_old - mi_new)

                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta2 = nrow * (mi_old - mi_new)

                    # COMBINED DELTA-SCORES
                    delta_score = delta1 + delta2

                    if delta_score > max_delta:
                        if debug:
                            print('Improved Arc Reversal: ', (u, v))
                            print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)

            #### RESTART WITH RANDOM MOVES ####
            if _restarts < R:
                improvement = True  # make another pass of hill climbing
                _iter = 0           # reset iterations
                if debug:
                    print('Restart - ', _restarts)
                _restarts += 1
                for _ in range(M):
                    # 0 = Addition, 1 = Deletion, 2 = Reversal
                    operation = np.random.choice([0, 1, 2])
                    if operation == 0:
                        while True:
                            u, v = np.random.choice(list(bn.nodes()), size=2, replace=False)
                            # IF EDGE DOESN'T EXIST, ADD IT
                            if u not in p_dict[v] and u != v and not would_cause_cycle(c_dict, u, v):
                                if debug:
                                    print('RESTART - ADDING: ', (u, v))
                                c_dict[u].append(v)
                                p_dict[v].append(u)
                                break
                    elif operation == 1:
                        while True:
                            u, v = np.random.choice(list(bn.nodes()), size=2, replace=False)
                            # IF EDGE EXISTS, DELETE IT
                            if u in p_dict[v]:
                                if debug:
                                    print('RESTART - DELETING: ', (u, v))
                                c_dict[u].remove(v)
                                p_dict[v].remove(u)
                                break
                    elif operation == 2:
                        while True:
                            u, v = np.random.choice(list(bn.nodes()), size=2, replace=False)
                            # IF EDGE EXISTS, REVERSE IT
                            if u in p_dict[v] and not would_cause_cycle(c_dict, v, u, reverse=True):
                                if debug:
                                    print('RESTART - REVERSING: ', (u, v))
                                c_dict[u].remove(v)
                                p_dict[v].remove(u)
                                c_dict[v].append(u)
                                p_dict[u].append(v)
                                break

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    # bn = BayesNet(c_dict)
    return c_dict
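# Usage sketch (illustrative only): hill climbing with R random restarts of
# M random moves each; in this version the learned structure comes back as a
# children dict rather than a BayesNet object.
#
#     import numpy as np
#     data = np.random.randint(0, 3, size=(1000, 6))
#     c_dict = hc_rr(data, M=5, R=3, metric='AIC', debug=True)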
class hill_climbing:
    def __init__(self, data, nodes):
        self.data = data
        self.nodes = nodes
        self.nrow = len(self.data)
        self.ncol = len(self.nodes)
        self.names = range(self.ncol)
        # From Density Estimation for Statistics and Data Analysis,
        # Bernard W. Silverman, CRC, 1986
        # (chapter "Required sample size for given accuracy")
        self.sample_size = [
            4, 19, 67, 223, 768, 2790, 10700, 43700, 187000, 842000
        ]

    def hc(self, metric='AIC', max_iter=100, debug=False, restriction=None, whitelist=None):
        """
        Greedy Hill Climbing search proceeds by choosing the move which
        maximizes the increase in fitness of the network at the current step.
        It continues until it reaches a point where there does not exist any
        feasible single move that increases the network fitness.

        It is called "greedy" because it simply does what is best at the
        current iteration only, and thus does not look ahead to what may be
        better later on in the search.

        For computational saving, a Priority Queue (python's heapq) can be
        used to maintain the best operators and reduce the complexity of
        picking the best operator from O(n^2) to O(nlogn). This works by
        maintaining the heapq of operators sorted by their delta score, and
        each time a move is made, we only have to recompute the O(n)
        delta-scores which were affected by the move. The rest of the
        operator delta-scores are not affected.

        For additional computational efficiency, we can cache the sufficient
        statistics for various families of distributions - therefore,
        computing the mutual information for a given family only needs to
        happen once.

        The possible moves are the following:
            - add edge
            - delete edge
            - invert edge

        Arguments
        ---------
        *data* : a nested numpy array
            The data from which the Bayesian network structure will be learned.

        *metric* : a string
            Which score metric to use.
            Options:
                - AIC
                - BIC / MDL
                - LL (log-likelihood)

        *max_iter* : an integer
            The maximum number of iterations of the hill-climbing algorithm
            to run. Note that the algorithm will terminate on its own if no
            improvement is made in a given iteration.

        *debug* : boolean
            Whether to print the scores/moves of the algorithm as it happens.

        *restriction* : a list of 2-tuples
            For MMHC algorithm, the list of allowable edge additions.

        Returns
        -------
        *bn* : a BayesNet object
        """
        # INITIALIZE NETWORK W/ NO EDGES
        # maintain children and parents dict for fast lookups
        self.c_dict = dict([(n, []) for n in self.nodes])
        self.p_dict = dict([(n, []) for n in self.nodes])
        self.restriction = restriction
        self.whitelist = whitelist

        if whitelist is None:
            self.whitelist = []
        for (u, v) in self.whitelist:
            if u in self.c_dict:
                self.c_dict[u].append(v)
            if v in self.p_dict:
                self.p_dict[v].append(u)
        print("Whitelist", self.whitelist)

        self.bn = BayesNet(self.c_dict)

        # COMPUTE INITIAL LIKELIHOOD SCORE
        print("Nodes:", list(self.bn.nodes()))
        # We do not take the complexity into account for Continuous Variables
        score = model_score(self.data, self.bn)  # - model_complexity(self.bn, self.nrow, metric)
        print("Initial Score:", score)

        # CREATE EMPIRICAL DISTRIBUTION OBJECT FOR CACHING
        #ED = EmpiricalDistribution(data,names)

        _iter = 0
        improvement = True

        man = Manager()
        mut_inf_cache = man.dict()
        configs_cache = man.dict()

        x = []
        y = []
        while improvement:
            x.append(_iter)
            y.append(score)
            start_t = time.time()
            improvement = False
            max_delta = 0
            max_operation = None

            if debug:
                print('ITERATION: ', _iter)

            # score additions and deletions in parallel worker processes
            return_queue = Queue()
            p_add = Process(target=self.test_arc_additions,
                            args=(configs_cache, mut_inf_cache, return_queue))
            p_rem = Process(target=self.test_arc_deletions,
                            args=(configs_cache, mut_inf_cache, return_queue))
            #p_rev = Process(target=self.test_arc_reversals, args=(configs_cache, mut_inf_cache, return_queue))

            p_add.start()
            p_rem.start()
            #p_rev.start()

            p_add.join()
            p_rem.join()
            #p_rev.join()

            while not return_queue.empty():
                results = return_queue.get()
                if results[1] > max_delta:
                    max_arc = results[0]
                    max_delta = results[1]
                    max_operation = results[2]

            ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
            if max_operation:
                score += max_delta
                improvement = True
                u, v = max_arc
                str_arc = [e for e in max_arc]
                if max_operation == 'Addition':
                    if debug:
                        print("delta:", max_delta)
                        print('ADDING: ', str_arc, '\n')
                    self.p_dict[v].append(u)
                    self.bn.add_edge(u, v)
                elif max_operation == 'Deletion':
                    if debug:
                        print("delta:", max_delta)
                        print('DELETING: ', str_arc, '\n')
                    self.p_dict[v].remove(u)
                    self.bn.remove_edge(u, v)
                elif max_operation == 'Reversal':
                    if debug:
                        print("delta:", max_delta)
                        print('REVERSING: ', str_arc, '\n')
                    self.p_dict[v].remove(u)
                    self.bn.remove_edge(u, v)
                    self.p_dict[u].append(v)
                    self.bn.add_edge(v, u)
                print("Model score:", score)  # TODO: improve so only changed elements get an update
            else:
                if debug:
                    print('No Improvement on Iter: ', _iter)
            print("Time for iteration:", time.time() - start_t)

            ### TEST FOR MAX ITERATION ###
            _iter += 1
            # if _iter > max_iter:
            #     if debug:
            #         print('Max Iteration Reached')
            #     break

        bn = BayesNet(self.c_dict)
        print("Size of Cache", len(mut_inf_cache))
        print("SCORE =", score)
        plt.plot(x, y)
        plt.show()

        return bn

    def test_arc_reversals(self, configs_cache, mut_inf_cache, return_queue):
        print("Test Reversals")
        ### TEST ARC REVERSALS ###
        max_delta = 0
        max_operation = None
        max_arc = None
        max_qi = 0
        for u in self.bn.nodes():
            for v in self.c_dict[u]:
                if not would_cause_cycle(self.c_dict, v, u, reverse=True) and (
                        self.restriction is None or (v, u) in self.restriction):
                    # and (self.whitelist is None or (u,v) not in self.whitelist):

                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u,) + tuple(self.p_dict[u])  # without 'v' as parent
                    if old_cols not in mut_inf_cache:
                        mut_inf_cache[old_cols] = mutual_information(self.data[list(old_cols)])
                    mi_old = mut_inf_cache[old_cols]

                    new_cols = old_cols + (v,)  # with 'v' as parent
                    if new_cols not in mut_inf_cache:
                        mut_inf_cache[new_cols] = mutual_information(self.data[list(new_cols)])
                    mi_new = mut_inf_cache[new_cols]
                    # Add difference in complexity -> recalculate qi for node v
                    delta1 = self.nrow * (mi_new - mi_old)

                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v,) + tuple(self.p_dict[v])  # with 'u' as parent
                    if old_cols not in mut_inf_cache:
                        mut_inf_cache[old_cols] = mutual_information(self.data[list(old_cols)])
                    mi_old = mut_inf_cache[old_cols]

                    new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                    if new_cols not in mut_inf_cache:
                        mut_inf_cache[new_cols] = mutual_information(self.data[list(new_cols)])
                    mi_new = mut_inf_cache[new_cols]
                    # Add difference in complexity -> recalculate qi for node v
                    delta2 = self.nrow * (mi_new - mi_old)

                    # COMBINED DELTA-SCORES
                    ri1 = self.bn.F[u]['ri']
                    qi1 = self.bn.F[u]['qi']
                    qi_new1 = calc_num_parent_configs(self.data,
                                                      self.bn.parents(u) + [v],
                                                      configs_cache)
                    ri2 = self.bn.F[v]['ri']
                    qi2 = self.bn.F[v]['qi']
                    qi_new2 = calc_num_parent_configs(self.data,
                                                      [x for x in self.bn.parents(v) if x != u],
                                                      configs_cache)
                    # Add difference in complexity -> recalculate qi for node u and v
                    delta_score = delta1 + delta2 \
                        - (ri2 * (qi_new2 - qi2) - (qi_new2 - qi2)) \
                        - (ri1 * (qi_new1 - qi1) - (qi_new1 - qi1))

                    if delta_score - max_delta > 10**(-10):
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)
                        max_qi = (qi_new1, qi_new2)

        return_queue.put((max_arc, max_delta, max_operation, max_qi))

    def test_arc_deletions(self, configs_cache, l_inf_cache, return_queue):
        print("Test Deletions")
        ### TEST ARC DELETIONS ###
        max_delta = 0
        max_operation = None
        max_arc = None
        max_qi = 0
        for u in self.bn.nodes():
            # whitelisted edges may not be deleted
            for v in [n for n in self.c_dict[u] if (u, n) not in self.whitelist]:
                #if (u,v) not in self.whitelist:
                # SCORE FOR 'V' -> losing a parent
                old_cols = (v,) + tuple(self.p_dict[v])  # with 'u' as parent
                if old_cols not in l_inf_cache:
                    l_inf_cache[old_cols] = calc_score(self.data, old_cols)
                old_cols2 = tuple(self.p_dict[v])
                if old_cols2 not in l_inf_cache:
                    l_inf_cache[old_cols2] = calc_score(self.data, old_cols2)
                l_old = l_inf_cache[old_cols] - l_inf_cache[old_cols2]

                new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                if len(new_cols) == 1:
                    if new_cols not in l_inf_cache:
                        l_inf_cache[new_cols] = calc_score(self.data, new_cols)
                    l_new = l_inf_cache[new_cols]
                else:
                    if new_cols not in l_inf_cache:
                        l_inf_cache[new_cols] = calc_score(self.data, new_cols)
                    # parent set of 'v' without 'u'
                    new_cols2 = tuple([n for n in self.p_dict[v] if n != u])
                    if new_cols2 not in l_inf_cache:
                        l_inf_cache[new_cols2] = calc_score(self.data, new_cols2)
                    l_new = l_inf_cache[new_cols] - l_inf_cache[new_cols2]

                delta_score = (l_new - l_old)
                #- self.sample_size[min(len(new_cols), len(self.sample_size))]

                if delta_score - max_delta > 10**(-10):
                    max_delta = delta_score
                    max_operation = 'Deletion'
                    max_arc = (u, v)

        return_queue.put((max_arc, max_delta, max_operation, max_qi))

    def test_arc_additions(self, configs_cache, l_inf_cache, return_queue):
        print("Test Additions")
        ### TEST ARC ADDITIONS ###
        max_delta = 0
        max_operation = None
        max_arc = None

        procs = []
        result_queue = Queue()
        # score the candidate parents of each node in its own process
        for u in self.bn.nodes():
            p = Process(target=self.test_arcs,
                        args=(configs_cache, l_inf_cache, u, result_queue))
            procs.append(p)
            p.start()
        for p in procs:
            p.join()

        while not result_queue.empty():
            results = result_queue.get()
            if results[1] - max_delta > 10**(-10):
                max_arc = results[0]
                max_delta = results[1]
                max_operation = results[2]

        return_queue.put((max_arc, max_delta, max_operation))

    def test_arcs(self, configs_cache, l_inf_cache, u, result_queue):
        max_delta = 0
        max_operation = None
        max_arc = None
        for v in [n for n in self.bn.nodes()
                  if u != n and n not in self.c_dict[u]
                  and not would_cause_cycle(self.c_dict, u, n)]:
            # FOR MMHC ALGORITHM -> Edge Restrictions
            if self.restriction is None or (u, v) in self.restriction:
                # SCORE FOR 'V' -> gaining a parent
                old_cols = (v,) + tuple(self.p_dict[v])  # without 'u' as parent
                if len(old_cols) == 1:
                    if old_cols not in l_inf_cache:
                        l_inf_cache[old_cols] = calc_score(self.data, old_cols)
                    l_old = l_inf_cache[old_cols]
                else:
                    if old_cols not in l_inf_cache:
                        l_inf_cache[old_cols] = calc_score(self.data, old_cols)
                    if tuple(self.p_dict[v]) not in l_inf_cache:
                        l_inf_cache[tuple(self.p_dict[v])] = calc_score(self.data, self.p_dict[v])
                    l_old = l_inf_cache[old_cols] - l_inf_cache[tuple(self.p_dict[v])]

                new_cols = old_cols + (u,)  # with 'u' as parent
                if new_cols not in l_inf_cache:
                    l_inf_cache[new_cols] = calc_score(self.data, new_cols)
                new_cols2 = tuple(self.p_dict[v]) + (u,)
                if new_cols2 not in l_inf_cache:
                    l_inf_cache[new_cols2] = calc_score(self.data, new_cols2)
                l_new = l_inf_cache[new_cols] - l_inf_cache[new_cols2]

                # penalize by the required sample size for the larger family
                # (index clamped to the last entry of sample_size)
                delta_score = (l_new - l_old) - self.sample_size[
                    min(len(new_cols), len(self.sample_size) - 1)]

                if delta_score - max_delta > 10**(-10):
                    max_delta = delta_score
                    max_operation = 'Addition'
                    max_arc = (u, v)

        result_queue.put((max_arc, max_delta, max_operation))
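# Usage sketch (illustrative only): the class-based, parallelised hill
# climber. `df` is assumed to be a tabular dataset whose columns can be
# selected with self.data[list(cols)] (pandas-style indexing), and the node
# names below are purely illustrative.
#
#     searcher = hill_climbing(df, nodes=list(df.columns))
#     learned_bn = searcher.hc(debug=True, whitelist=[('A', 'B')])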
def iamb(data, alpha=0.05, feature_selection=None, debug=False):
    """
    IAMB Algorithm for learning the structure of a Discrete Bayesian Network
    from data.

    Arguments
    ---------
    *data* : a nested numpy array

    *alpha* : a float
        The type I error rate.

    *feature_selection* : None or a string
        Whether to use IAMB as a structure learning or feature selection
        algorithm.

    Returns
    -------
    *bn* : a BayesNet object or
    *mb* : the markov blanket of a node

    Effects
    -------
    None

    Notes
    -----
    - Works but there are definitely some bugs.

    Speed Test:
        *** 5 vars, 624 obs ***
        - 196 ms
    """
    n_rv = data.shape[1]
    Mb = dict([(rv, []) for rv in range(n_rv)])

    if feature_selection is None:
        _T = range(n_rv)
    else:
        assert not isinstance(feature_selection, list), \
            'feature_selection must be only one value'
        _T = [feature_selection]

    # LEARN MARKOV BLANKET
    for T in _T:
        V = set(range(n_rv)) - {T}
        Mb_change = True

        # GROWING PHASE
        while Mb_change:
            Mb_change = False

            # find X_max in V-Mb(T)-{T} that maximizes
            # mutual information of X,T|Mb(T)
            # i.e. max of mi_test(data[:,(X,T,Mb(T))])
            max_val = -1
            max_x = None
            for X in V - set(Mb[T]) - {T}:
                cols = (X, T) + tuple(Mb[T])
                mi_val = mi_test(data[:, cols], test=False)
                if mi_val > max_val:
                    max_val = mi_val
                    max_x = X

            # if X_max is dependent on T given Mb(T), add it to the blanket
            cols = (max_x, T) + tuple(Mb[T])
            if max_x is not None and not are_independent(data[:, cols]):
                Mb[T].append(max_x)
                Mb_change = True
                if debug:
                    print('Adding %s to MB of %s' % (str(max_x), str(T)))

        # SHRINKING PHASE
        # iterate over a copy since variables may be removed from Mb[T]
        for X in list(Mb[T]):
            # if X is independent of T given Mb(T) - {X}, remove X
            cols = (X, T) + tuple(set(Mb[T]) - {X})
            if are_independent(data[:, cols], alpha):
                Mb[T].remove(X)
                if debug:
                    print('Removing %s from MB of %s' % (str(X), str(T)))

    if feature_selection is None:
        # RESOLVE GRAPH STRUCTURE
        edge_dict = resolve_markov_blanket(Mb, data)
        if debug:
            print('Unoriented edge dict:\n %s' % str(edge_dict))
            print('MB: %s' % str(Mb))

        # ORIENT EDGES
        oriented_edge_dict = orient_edges_gs2(edge_dict, Mb, data, alpha)
        if debug:
            print('Oriented edge dict:\n %s' % str(oriented_edge_dict))

        # CREATE BAYESNET OBJECT
        value_dict = dict(zip(range(data.shape[1]),
                              [list(np.unique(col)) for col in data.T]))
        bn = BayesNet(oriented_edge_dict, value_dict)

        return bn
    else:
        return Mb[_T[0]]
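# Usage sketch (illustrative only): IAMB as a structure learner or as a
# Markov-blanket feature selector.
#
#     import numpy as np
#     data = np.random.randint(0, 2, size=(624, 5))
#     bn = iamb(data, alpha=0.05)
#     mb_of_2 = iamb(data, feature_selection=2)   # Markov blanket of column 2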
def setUp(self):
    self.bn = BayesNet()
    self.dpath = os.path.join(dirname(dirname(dirname(dirname(__file__)))), 'data')
    self.bn_bif = read_bn(os.path.join(self.dpath, 'cancer.bif'))
    self.bn_bn = read_bn(os.path.join(self.dpath, 'cmu.bn'))
def hc(data, metric='AIC', max_iter=100, debug=False, restriction=None): """ Greedy Hill Climbing search proceeds by choosing the move which maximizes the increase in fitness of the network at the current step. It continues until it reaches a point where there does not exist any feasible single move that increases the network fitness. It is called "greedy" because it simply does what is best at the current iteration only, and thus does not look ahead to what may be better later on in the search. For computational saving, a Priority Queue (python's heapq) can be used to maintain the best operators and reduce the complexity of picking the best operator from O(n^2) to O(nlogn). This works by maintaining the heapq of operators sorted by their delta score, and each time a move is made, we only have to recompute the O(n) delta-scores which were affected by the move. The rest of the operator delta-scores are not affected. For additional computational efficiency, we can cache the sufficient statistics for various families of distributions - therefore, computing the mutual information for a given family only needs to happen once. The possible moves are the following: - add edge - delete edge - invert edge Arguments --------- *data* : a nested numpy array The data from which the Bayesian network structure will be learned. *metric* : a string Which score metric to use. Options: - AIC - BIC / MDL - LL (log-likelihood) *max_iter* : an integer The maximum number of iterations of the hill-climbing algorithm to run. Note that the algorithm will terminate on its own if no improvement is made in a given iteration. *debug* : boolean Whether to print(the scores/moves of the) algorithm as its happening. *restriction* : a list of 2-tuples For MMHC algorithm, the list of allowable edge additions. 
    Returns
    -------
    *c_dict* : a dictionary
        The learned structure: key = rv, value = list of its children.

    """
    nrow = data.shape[0]
    ncol = data.shape[1]
    names = range(ncol)

    # INITIALIZE NETWORK W/ NO EDGES
    # maintain children and parents dict for fast lookups
    c_dict = dict([(n, []) for n in names])
    p_dict = dict([(n, []) for n in names])

    # COMPUTE INITIAL LIKELIHOOD SCORE
    value_dict = dict([(n, np.unique(data[:, i])) for i, n in enumerate(names)])
    bn = BayesNet(c_dict)
    mle_estimator(bn, data)
    max_score = info_score(bn, nrow, metric)

    # CREATE EMPIRICAL DISTRIBUTION OBJECT FOR CACHING
    #ED = EmpiricalDistribution(data,names)

    _iter = 0
    improvement = True

    while improvement:
        improvement = False
        max_delta = 0

        if debug:
            print('ITERATION: ', _iter)

        ### TEST ARC ADDITIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v not in c_dict[u] and u != v and not would_cause_cycle(c_dict, u, v):
                    # FOR MMHC ALGORITHM -> Edge Restrictions
                    if restriction is None or (u, v) in restriction:
                        # SCORE FOR 'V' -> gaining a parent
                        old_cols = (v,) + tuple(p_dict[v])  # without 'u' as parent
                        mi_old = mutual_information(data[:, old_cols])
                        new_cols = old_cols + (u,)  # with 'u' as parent
                        mi_new = mutual_information(data[:, new_cols])
                        delta_score = nrow * (mi_old - mi_new)

                        if delta_score > max_delta:
                            #if debug:
                            #    print('Improved Arc Addition: ', (u,v))
                            #    print('Delta Score: ', delta_score)
                            max_delta = delta_score
                            max_operation = 'Addition'
                            max_arc = (u, v)

        ### TEST ARC DELETIONS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u]:
                    # SCORE FOR 'V' -> losing a parent
                    old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta_score = nrow * (mi_old - mi_new)

                    if delta_score > max_delta:
                        #if debug:
                        #    print('Improved Arc Deletion: ', (u,v))
                        #    print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Deletion'
                        max_arc = (u, v)

        ### TEST ARC REVERSALS ###
        for u in bn.nodes():
            for v in bn.nodes():
                if v in c_dict[u] and not would_cause_cycle(c_dict, v, u, reverse=True):
                    # SCORE FOR 'U' -> gaining 'v' as parent
                    old_cols = (u,) + tuple(p_dict[v])  # without 'v' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = old_cols + (v,)  # with 'v' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta1 = nrow * (mi_old - mi_new)

                    # SCORE FOR 'V' -> losing 'u' as parent
                    old_cols = (v,) + tuple(p_dict[v])  # with 'u' as parent
                    mi_old = mutual_information(data[:, old_cols])
                    new_cols = tuple([i for i in old_cols if i != u])  # without 'u' as parent
                    mi_new = mutual_information(data[:, new_cols])
                    delta2 = nrow * (mi_old - mi_new)

                    # COMBINED DELTA-SCORES
                    delta_score = delta1 + delta2

                    if delta_score > max_delta:
                        #if debug:
                        #    print('Improved Arc Reversal: ', (u,v))
                        #    print('Delta Score: ', delta_score)
                        max_delta = delta_score
                        max_operation = 'Reversal'
                        max_arc = (u, v)

        ### DETERMINE IF/WHERE IMPROVEMENT WAS MADE ###
        if max_delta != 0:
            improvement = True
            u, v = max_arc
            if max_operation == 'Addition':
                if debug:
                    print('ADDING: ', max_arc, '\n')
                c_dict[u].append(v)
                p_dict[v].append(u)
            elif max_operation == 'Deletion':
                if debug:
                    print('DELETING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
            elif max_operation == 'Reversal':
                if debug:
                    print('REVERSING: ', max_arc, '\n')
                c_dict[u].remove(v)
                p_dict[v].remove(u)
                c_dict[v].append(u)
                p_dict[u].append(v)
        else:
            if debug:
                print('No Improvement on Iter: ', _iter)

        ### TEST FOR MAX ITERATION ###
        _iter += 1
        if _iter > max_iter:
            if debug:
                print('Max Iteration Reached')
            break

    # bn = BayesNet(c_dict)
    # print("bn is: " + str(bn.E))
    return c_dict
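# Usage sketch for the hill-climbing learner above (illustrative only).
# The function name `hc` and the argument names `metric`, `max_iter`, `debug`
# and `restriction` are assumptions inferred from the body above; adjust them
# to the actual signature defined in this module.
if __name__ == '__main__':
    import numpy as np
    toy_data = np.random.randint(0, 2, size=(500, 4))  # hypothetical discrete dataset
    children = hc(toy_data, metric='BIC', max_iter=100, debug=False, restriction=None)
    # `children` maps each column index to the list of its learned children
    # and can be wrapped into a BayesNet via BayesNet(children) if desired.
    print(children)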
def read_json(path):
    """
    Read a BayesNet object from the json format. This format
    has the ".bn" extension and is completely unique to pyBN.

    Arguments
    ---------
    *path* : a string
        The file path

    Returns
    -------
    *bn* : a BayesNet object
        The BayesNet instantiated from the file.

    Effects
    -------
    None

    Notes
    -----
    This function reads in a libpgm-style format into a bn object.

    File Format:
        {
            "V": ["Letter", "Grade", "Intelligence", "SAT", "Difficulty"],
            "E": [["Intelligence", "Grade"],
                  ["Difficulty", "Grade"],
                  ["Intelligence", "SAT"],
                  ["Grade", "Letter"]],
            "Vdata": {
                "Letter": {
                    "ord": 4,
                    "numoutcomes": 2,
                    "vals": ["weak", "strong"],
                    "parents": ["Grade"],
                    "children": None,
                    "cprob": [[.1, .9], [.4, .6], [.99, .01]]
                },
                ...
            }
        }
    """
    def byteify(input):
        # recursively convert unicode keys/values to utf-8 strings (Python 2)
        if isinstance(input, dict):
            return {byteify(key): byteify(value)
                    for key, value in input.iteritems()}
        elif isinstance(input, list):
            return [byteify(element) for element in input]
        elif isinstance(input, unicode):
            return input.encode('utf-8')
        else:
            return input

    bn = BayesNet()
    with open(path, 'r') as f:
        ftxt = f.read()
    try:
        data = byteify(json.loads(ftxt))
        bn.V = data['V']
        bn.E = data['E']
        bn.F = data['F']
    except ValueError:
        print("Could not read file - check format")
    bn.V = topsort(bn.E)
    return bn
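# Usage sketch for read_json (illustrative only). The file path below is a
# hypothetical placeholder for a pyBN-style ".bn" json file in the format
# described in the docstring above.
if __name__ == '__main__':
    example_bn = read_json('examples/student.bn')  # hypothetical path
    print(example_bn.V)  # topologically sorted vertex list
    print(example_bn.E)  # edges as read from the "E" field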
def read_bif(path):
    """
    This function reads a .bif file into a BayesNet object.
    It's probably not the fastest or prettiest but it gets
    the job done.

    Arguments
    ---------
    *path* : a string
        The path

    Returns
    -------
    *bn* : a BayesNet object

    Effects
    -------
    None

    Notes
    -----
    *V* : a list of strings
    *E* : a dict, where key = vertex, val = list of its children
    *F* : a dict, where key = rv, val = another dict with
            keys = 'parents', 'values', 'cpt'
    """
    _parents = {}  # key = vertex, value = list of its parents
    _cpt = {}      # key = vertex, value = list of cpt rows
    _vals = {}     # key = vertex, value = list of its possible values

    with open(path, 'r') as f:
        while True:
            line = f.readline()
            if 'variable' in line:
                new_vertex = line.split()[1]
                _parents[new_vertex] = []
                _cpt[new_vertex] = []
                new_line = f.readline()
                new_vals = new_line.replace(',', ' ').split()[6:-1]  # list of vals
                _vals[new_vertex] = new_vals
            elif 'probability' in line:
                line = line.replace(',', ' ')
                child_rv = line.split()[2]
                parent_rvs = line.split()[4:-2]
                if len(parent_rvs) == 0:  # prior
                    new_line = f.readline().replace(';', ' ').replace(',', ' ').split()
                    prob_values = new_line[1:]
                    _cpt[child_rv].append(map(float, prob_values))
                else:  # not a prior
                    _parents[child_rv].extend(list(parent_rvs))
                    while True:
                        new_line = f.readline()
                        if '}' in new_line:
                            break
                        new_line = new_line.replace(',', ' ').replace(';', ' ')\
                                           .replace('(', ' ').replace(')', ' ').split()
                        # keep only the probability values for this child rv
                        prob_values = new_line[-(len(_vals[child_rv])):]
                        prob_values = map(float, prob_values)
                        _cpt[child_rv].append(prob_values)
            if line == '':
                break

    # CREATE FACTORS
    _F = {}
    _E = {}
    for rv in _vals.keys():
        _E[rv] = [c for c in _vals.keys() if rv in _parents[c]]
        f = {
            'parents': _parents[rv],
            'values': _vals[rv],
            'cpt': [item for sublist in _cpt[rv] for item in sublist]
        }
        _F[rv] = f

    bn = BayesNet()
    bn.F = _F
    bn.E = _E
    bn.V = list(topsort(_E))
    return bn
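# Usage sketch for read_bif (illustrative only). The path is a hypothetical
# placeholder for any standard .bif network file.
if __name__ == '__main__':
    bif_bn = read_bif('examples/asia.bif')  # hypothetical path
    for rv in bif_bn.V:
        # each factor stores the rv's parents, its possible values, and a flattened cpt
        print(rv, bif_bn.F[rv]['parents'], bif_bn.F[rv]['values'])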
def train_model(data: np.ndarray, clusters: int = 5, init_nodes: list = None) -> BayesianNetwork:
    # Cluster the initial data in order to fill in a hidden variable
    # based on the distribution of clusters
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(data)
    labels = kmeans.labels_
    hidden_dist = DiscreteDistribution.from_samples(labels)
    hidden_var = np.array(hidden_dist.sample(data.shape[0]))
    new_data = np.column_stack((data, hidden_var))
    latent = new_data.shape[1] - 1

    # Train the network structure on the data, taking the hidden variable into account
    bn = hc_rr(new_data, latent=latent, init_nodes=init_nodes)
    structure = []
    nodes = sorted(list(bn.nodes()))
    for rv in nodes:
        structure.append(tuple(bn.F[rv]['parents']))
    structure = tuple(structure)
    bn = BayesianNetwork.from_structure(new_data, structure)
    bn.bake()

    # Learn the hidden variable: impute its missing column, then refit on the imputed data
    hidden_var = np.array([np.nan] * (data.shape[0]))
    new_data = np.column_stack((data, hidden_var))
    new_data = np.array(bn.predict(new_data))
    bn.fit(new_data)
    bn.bake()
    return bn
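# Usage sketch for train_model (illustrative only). It assumes scikit-learn's
# KMeans, pomegranate's BayesianNetwork/DiscreteDistribution and the hc_rr
# structure learner are importable in this module, as the function body implies.
if __name__ == '__main__':
    import numpy as np
    observed = np.random.randint(0, 3, size=(200, 5))  # hypothetical discrete data
    model = train_model(observed, clusters=4)
    # the returned pomegranate BayesianNetwork has one extra latent column appended
    print(model.structure)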