def train_model(data: np.ndarray, clusters: int = 5, init_nodes: list = None) -> BayesianNetwork:
    """Train a Bayesian network over *data* augmented with one latent variable.

    A hidden column is synthesized by clustering the data with k-means and
    sampling from the empirical cluster-label distribution; the structure is
    then learned with restarting hill-climbing (hc_rr) treating that column
    as latent, and the parameters are re-fit with the hidden column masked.

    Arguments
    ---------
    *data* : np.ndarray
        Observed samples, one row per sample.
    *clusters* : int
        Number of k-means clusters, i.e. cardinality of the hidden variable.
    *init_nodes* : list or None
        Passed through to hc_rr as initial nodes for the structure search.

    Returns
    -------
    *bn* : a baked pomegranate BayesianNetwork over data plus the hidden column.
    """
    # Cluster the initial data in order to fill in a hidden variable based on
    # the distribution of clusters.
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(data)
    labels = kmeans.labels_
    hidden_dist = DiscreteDistribution.from_samples(labels)
    hidden_var = np.array(hidden_dist.sample(data.shape[0]))
    new_data = np.column_stack((data, hidden_var))
    latent = new_data.shape[1] - 1  # index of the appended hidden column

    # Train the network structure on data taking the hidden variable into account.
    # (The original also built an unused BayesNet() first; that dead assignment
    # is removed here.)
    bn = hc_rr(new_data, latent=latent, init_nodes=init_nodes)
    structure = tuple(tuple(bn.F[rv]['parents']) for rv in sorted(bn.nodes()))
    bn = BayesianNetwork.from_structure(new_data, structure)
    bn.bake()

    # Learn the hidden variable: mask the hidden column with NaN and re-fit.
    hidden_var = np.array([np.nan] * data.shape[0])
    new_data = np.column_stack((data, hidden_var))
    # NOTE(review): predict()'s return value was discarded in the original and
    # is preserved as-is here — confirm whether new_data should instead be
    # replaced by the imputed prediction before fitting.
    bn.predict(new_data)
    bn.fit(new_data)
    bn.bake()
    return bn
def mmhc(data, alpha=0.05, metric='AIC', max_iter=100, method='hc'):
    """
    Max-Min Hill Climbing Algorithm for learning a
    Bayesian Network structure from data.

    Arguments
    ---------
    *data* : a numpy ndarray

    *alpha* : a float
        Probability of Type II Error for independence tests.
        NOTE(review): currently unused in this body — mmpc() is called
        without it; confirm whether it should be forwarded.

    *metric* : a string
        Which score metric to use.
        Options:
            - 'AIC'
            - 'BIC'
            - 'LL' (log-likelihood)

    *max_iter* : an integer
        Maximum number of hill-climbing iterations.

    *method* : a string
        The type of hill-climbing algorithm to run.
        OPTIONS:
            - 'hc'   : normal hill-climbing
            - 'rr'   : hill-climbing with random restarts
            - 'tabu' : tabu hill-climbing

    Returns
    -------
    *bn* : a BayesNet object
    """
    # GET EDGE RESTRICTIONS FROM MMPC: only edges (y, x) with x in the
    # parent-child set of y are allowed during the hill-climbing search.
    PC_dict = mmpc(data)
    restriction = [(y, x) for y, pc in PC_dict.items() for x in pc]

    # RUN HILL-CLIMBING WITH EDGE RESTRICTIONS
    if method == 'tabu':
        bn = tabu(data=data, metric=metric, max_iter=max_iter, restriction=restriction)
    elif method == 'rr':
        bn = hc_rr(data=data, metric=metric, max_iter=max_iter, restriction=restriction)
    else:
        bn = hc(data=data, metric=metric, max_iter=max_iter, restriction=restriction)
    return bn
def mdbn(data, f_cols, c_cols, f_struct='DAG', c_struct='DAG', wrapper=False):
    """
    Learn the structure of a Multi-Dimensional Bayesian Network -
    typically used for classification. Note that this structure does
    not have to be used for classification, since it simply returns a
    Bayesian Network - albeit with a more unique structure than
    traditionally found. If there are any other applications of this
    bipartite-like BN structure learning, this algorithm can certainly
    be used.

    NOTE(review): *f_struct*, *c_struct* and *wrapper* are accepted but
    not referenced in this body — presumably reserved for future use.
    """
    # Learn a structure over the feature columns and the class columns
    # independently, then bridge the two networks over the full data.
    feature_data = data[:, f_cols]
    class_data = data[:, c_cols]
    feature_bn = hc_rr(feature_data)
    class_bn = hc_rr(class_data)
    return bridge(c_bn=class_bn, f_bn=feature_bn, data=data)
def bridge(c_bn, f_bn, data):
    """
    Make a Multi-Dimensional Bayesian Network by
    bridging two Bayesian network structures. This happens
    by placing edges from c_bn -> f_bn using a heuristic
    optimization procedure.

    This can be used to create a Multi-Dimensional Bayesian
    Network classifier from two already-learned Bayesian networks -
    one of which is a BN containing all the class variables,
    the other containing all the feature variables.

    Arguments
    ---------
    *c_bn* : a BayesNet object with known structure

    *f_bn* : a BayesNet object with known structure.

    Returns
    -------
    *m_bn* : a merged/bridge BayesNet object,
        whose structure contains *c_bn*, *f_bn*, and some bridge
        edges between them.
    """
    # Only allow bridge edges directed from c_bn nodes into f_bn nodes.
    restrict = []
    for u in c_bn:
        for v in f_bn:
            restrict.append((u, v))

    bridge_bn = hc_rr(data, restriction=restrict)

    # Merge the learned bridge edges with both original edge sets.
    m_bn = bridge_bn.E
    m_bn.update(c_bn.E)
    m_bn.update(f_bn.E)
    mbc_bn = BayesNet(E=m_bn)
    # BUG FIX: the original fell off the end and implicitly returned None,
    # despite the documented return value and callers (mdbn) using the result.
    return mbc_bn