def set_structure(self,
                  info: Optional[Dict] = None,
                  nodes: Optional[List] = None,
                  edges: Optional[List[Sequence[str]]] = None,
                  overwrite: bool = True):
    """
    Function to set the structure manually
    info: descriptor with types and signs of nodes
    nodes: list of nodes
    edges: list of edges
    overwrite: whether to run the second stage of vertex definition
    """
    if nodes and (info or
                  (self.descriptor["types"] and self.descriptor["signs"])):
        self.set_nodes(nodes=nodes, info=info)
    if edges:
        self.set_edges(edges=edges)
        if overwrite:
            builder = Builders.VerticesDefiner(
                descriptor=self.descriptor)  # init worker
            builder.skeleton['V'] = builder.vertices  # stage 1
            builder.skeleton['E'] = self.edges
            builder.get_family()
            if self.edges:
                builder.overwrite_vertex(has_logit=self.has_logit,
                                         use_mixture=self.use_mixture)
                self.set_nodes(nodes=builder.skeleton['V'])
            else:
                logger_network.error("Empty set of edges")

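# A minimal usage sketch for manual structure definition (the node classes and
# column names below are illustrative; DiscreteNode/GaussianNode are assumed to
# be available from bamt's Nodes module):
#
#   info = {'types': {'sex': 'disc', 'age': 'cont'},
#           'signs': {'age': 'pos'}}
#   bn.set_structure(info=info,
#                    nodes=[Nodes.DiscreteNode(name='sex'),
#                           Nodes.GaussianNode(name='age')],
#                    edges=[('sex', 'age')])
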
def set_parameters(self, parameters: Dict):
    if not self.nodes:
        logger_network.error("Failed on search of BN's nodes.")
        return
    # elif self._param_validation(parameters):
    #     pass
    self.distributions = parameters

def add_edges(self,
              data: pd.DataFrame,
              scoring_function: Union[Tuple[str, Callable], Tuple[str]],
              classifier: Optional[object] = None,
              params: Optional[ParamDict] = None,
              optimizer: str = 'HC'):
    """
    Base function for structure learning
    scoring_function: tuple with the following format (NAME, scoring_function) or (NAME,)
    Params:
    init_edges: list of tuples, a graph to start learning with
    remove_init_edges: allows changes in the model defined by the user
    white_list: list of allowed edges
    """
    if not self.has_logit and classifier:
        logger_network.error(
            "Classifiers dict will be ignored since logit nodes are forbidden.")
        return None

    # params validation
    if params:
        # init_edges validation
        if not self.has_logit and "init_edges" in params.keys():
            type_map = np.array([[self.descriptor["types"][node1],
                                  self.descriptor["types"][node2]]
                                 for node1, node2 in params["init_edges"]])
            failed = ((type_map[:, 0] == "cont") &
                      ((type_map[:, 1] == "disc") |
                       (type_map[:, 1] == "disc_num")))
            if sum(failed):
                logger_network.warning(
                    f"Edges from continuous nodes to discrete nodes are forbidden "
                    f"(has_logit = {self.has_logit}), they will be ignored. "
                    f"Indexes: {np.where(failed)[0]}")
                # Delete from the highest index first so earlier indexes stay valid.
                for index in sorted(np.where(failed)[0], reverse=True):
                    del params["init_edges"][index]

    if not self.validate(descriptor=self.descriptor):
        logger_network.error(
            f"{self.type} BN does not support "
            f"{'discrete' if self.type == 'Continuous' else 'continuous'} data")
        return None
    if optimizer == 'HC':
        worker = Builders.HCStructureBuilder(
            data=data,
            descriptor=self.descriptor,
            scoring_function=scoring_function,
            has_logit=self.has_logit,
            use_mixture=self.use_mixture)
        self.sf_name = scoring_function[0]
        worker.build(data=data, params=params, classifier=classifier)

        # update family
        self.nodes = worker.skeleton['V']
        self.edges = worker.skeleton['E']

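# A typical structure-learning call (a sketch, not the only supported form):
# `descriptor` and `discretized_data` are assumed to come from bamt's
# preprocessing step, and K2Score from pgmpy:
#
#   from pgmpy.estimators import K2Score
#
#   bn.add_nodes(descriptor)
#   bn.add_edges(discretized_data,
#                scoring_function=('K2', K2Score),
#                params={'init_edges': [('sex', 'age')],
#                        'remove_init_edges': True})
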
def _param_validation(self, params: Dict[str, Any]) -> bool:
    if all(self[i] for i in params.keys()):
        for name, info in params.items():
            try:
                self[name].choose(node_info=info, pvals=[])
            except Exception as ex:
                logger_network.error("Validation failed", exc_info=ex)
                return False
        return True
    else:
        logger_network.error("Param validation failed due to unknown nodes.")
        return False

def set_classifiers(self, classifiers: Dict[str, object]):
    """
    Set classifiers for logit nodes.
    classifiers: dict with node names as keys and classifiers as values
    """
    if not self.has_logit:
        logger_network.error("Logit nodes are forbidden.")
        return None
    for node in self.nodes:
        if "Logit" in node.type and node.name in classifiers.keys():
            node.classifier = classifiers[node.name]
            node.type = re.sub(r"\([\s\S]*\)",
                               f"({type(node.classifier).__name__})",
                               node.type)

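# Sketch: overriding the default classifiers of logit nodes with sklearn
# estimators (node names 'sex' and 'city' are hypothetical and assume a
# network built with has_logit=True):
#
#   from sklearn.ensemble import RandomForestClassifier
#   from sklearn.neighbors import KNeighborsClassifier
#
#   bn.set_classifiers(classifiers={
#       'sex': RandomForestClassifier(n_estimators=100),
#       'city': KNeighborsClassifier(n_neighbors=5)})
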
def set_nodes(self, nodes: List, info: Optional[Dict] = None):
    """
    Additional function to set nodes manually. The user should be aware
    that every node must be a subclass of BaseNode.
    :param nodes: list of nodes to set
    :param info: descriptor with types and signs of the nodes
    """
    if not info and not self.descriptor["types"]:
        logger_network.error(
            "In case of manual nodes setting, the user should provide a "
            "descriptor for them as well.")
        return
    self.nodes = []
    for node in nodes:
        try:
            assert issubclass(type(node), Nodes.BaseNode)
            self.nodes.append(node)
        except AssertionError:
            logger_network.error(
                f"{node} is not an instance of {Nodes.BaseNode}")
        except TypeError:
            logger_network.error(f"TypeError: {node.__class__}")
    if info:
        self.descriptor = info

def add_nodes(self, descriptor: Dict[str, Dict[str, str]]):
    """
    Function for initializing nodes in a Bayesian Network
    descriptor: dict with types and signs of nodes
    """
    if not self.validate(descriptor=descriptor):
        if self.type != 'Hybrid':
            logger_network.error(
                f"{self.type} BN does not support "
                f"{'discrete' if self.type == 'Continuous' else 'continuous'} data")
        else:
            logger_network.error(
                "Descriptor validation failed due to wrong type of column(s).")
        return
    elif ['Abstract'] in self._allowed_dtypes:
        return None
    self.descriptor = descriptor
    # LEVEL 1
    worker_1 = Builders.VerticesDefiner(descriptor)
    self.nodes = worker_1.vertices

def calculate_weights(self, discretized_data: pd.DataFrame):
    """
    Provide calculation of link strength according to the mutual information
    between a node and its parent(s).
    """
    import bamt.utils.GraphUtils as gru

    data_types = gru.nodes_types(discretized_data)
    if not all(node_type in ['disc', 'disc_num']
               for node_type in data_types.values()):
        logger_network.error(
            "calculate_weights() method deals only with discrete data. Continuous data: "
            + f"{[col for col, node_type in data_types.items() if node_type not in ['disc', 'disc_num']]}")
    if not self.edges:
        logger_network.error(
            "Bayesian Network hasn't been fitted yet. Please add edges with the add_edges() method")
    if not self.nodes:
        logger_network.error(
            "Bayesian Network hasn't been fitted yet. Please add nodes with the add_nodes() method")
    weights = dict()
    for node in self.nodes:
        parents = node.cont_parents + node.disc_parents
        if not parents:
            continue
        y = discretized_data[node.name].values
        if len(parents) == 1:
            x = discretized_data[parents[0]].values
            # Link strength: mutual information normalized by the child's entropy.
            LS_true = drv.information_mutual(X=y, Y=x)
            entropy = drv.entropy(X=y)
            weight = LS_true / entropy
            weights[(parents[0], node.name)] = weight
        else:
            for parent_node in parents:
                x = discretized_data[parent_node].values
                other_parents = [tmp for tmp in parents if tmp != parent_node]
                z = [list(discretized_data[other_parent].values)
                     for other_parent in other_parents]
                # Conditional mutual information normalized by conditional entropy.
                LS_true = np.average(
                    drv.information_mutual_conditional(
                        X=y, Y=x, Z=z, cartesian_product=True))
                entropy = np.average(
                    drv.entropy_conditional(
                        X=y, Y=z, cartesian_product=True)) + 1e-8
                weight = LS_true / entropy
                weights[(parent_node, node.name)] = weight
    self.weights = weights

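# Sketch: computing edge strengths on already-discretized data. Each weight is
# the (conditional) mutual information between child and parent divided by the
# (conditional) entropy of the child, so values fall roughly in [0, 1]:
#
#   bn.calculate_weights(discretized_data)
#   bn.weights   # e.g. {('sex', 'age'): 0.12, ...}  (values illustrative)
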
def set_edges(self, edges: Optional[List[Sequence[str]]] = None):
    """
    Additional function to set edges manually. The user should be aware
    that nodes must be set beforehand.
    :param edges: list of (parent_name, child_name) pairs
    """
    if not self.nodes:
        logger_network.error("Graph without nodes")
        return
    self.edges = []
    for node1, node2 in edges:
        if isinstance(node1, str) and isinstance(node2, str):
            if self[node1] and self[node2]:
                self.edges.append((node1, node2))
            else:
                logger_network.error(f"Unknown Nodes : [{node1}, {node2}]")
        else:
            logger_network.error(
                f"Unknown node(s) type: [{node1.__class__}, {node2.__class__}]")
    self.update_descriptor()

def plot(self, output: str):
    """
    Visualize a Bayesian Network. The result will be saved in the current
    directory in the folder visualization_result.
    output: str, name of the output file
    """
    if not output.endswith('.html'):
        logger_network.error("This version allows only html format.")
        return None

    G = nx.DiGraph()
    nodes = [node.name for node in self.nodes]
    G.add_nodes_from(nodes)
    G.add_edges_from(self.edges)

    network = Network(height="800px", width="100%", notebook=True,
                      directed=nx.is_directed(G), layout='hierarchical')

    nodes_sorted = np.array(list(nx.topological_generations(G)),
                            dtype=object)

    # Qualitative class of colormaps
    q_classes = [
        'Pastel1', 'Pastel2', 'Paired', 'Accent', 'Dark2', 'Set1', 'Set2',
        'Set3', 'tab10', 'tab20', 'tab20b', 'tab20c'
    ]

    hex_colors = []
    for cls in q_classes:
        rgb_colors = plt.get_cmap(cls).colors
        hex_colors.extend([
            matplotlib.colors.rgb2hex(rgb_color) for rgb_color in rgb_colors
        ])
    hex_colors = np.array(hex_colors)

    # Total number of colors in matplotlib's qualitative classes = 144
    class_number = len(set(node.type for node in self.nodes))
    hex_colors_indexes = [
        random.randint(0, len(hex_colors) - 1) for _ in range(class_number)
    ]
    hex_colors_picked = hex_colors[hex_colors_indexes]
    class2color = {
        cls: color
        for cls, color in zip(set(node.type for node in self.nodes),
                              hex_colors_picked)
    }
    name2class = {node.name: node.type for node in self.nodes}

    for level in range(len(nodes_sorted)):
        for node_i in range(len(nodes_sorted[level])):
            name = nodes_sorted[level][node_i]
            cls = name2class[name]
            color = class2color[cls]
            network.add_node(name, label=name, color=color, size=45,
                             level=level, font={'size': 36},
                             title=f'Bayesian network node {name} ({cls})')

    for edge in G.edges:
        network.add_edge(edge[0], edge[1])

    network.hrepulsion(node_distance=300, central_gravity=0.5)

    if not os.path.exists('visualization_result'):
        os.mkdir("visualization_result")

    return network.show('visualization_result/' + output)

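# Sketch: rendering the learned graph as an interactive pyvis page; the file
# is written to ./visualization_result and only .html output is accepted:
#
#   bn.plot('my_network.html')
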
def predict(self,
            test: pd.DataFrame,
            parall_count: int = 1
            ) -> Dict[str, Union[List[str], List[int], List[float]]]:
    """
    Function to predict columns from given data.
    Note that the columns to be predicted must be absent from the test data,
    and both train and test datasets must be cleaned from NaNs.

    Args:
        test (pd.DataFrame): test dataset
        parall_count (int, optional): number of threads. Defaults to 1.

    Returns:
        predicted data (dict): dict with a column as key and predicted data as value
    """
    if test.isnull().any().any():
        logger_network.error("Test data contains NaN values.")
        return {}

    from joblib import Parallel, delayed

    def wrapper(bn: HybridBN, test: pd.DataFrame, columns: List[str]):
        preds = {column_name: list() for column_name in columns}

        if len(test) == 1:
            for i in range(test.shape[0]):
                test_row = dict(test.iloc[i, :])
                for key in columns:
                    try:
                        sample = bn.sample(1, evidence=test_row, predict=True)
                        if bn.descriptor['types'][key] == 'cont':
                            if (bn.descriptor['signs'][key] == 'pos') and \
                                    (sample.loc[0, key] < 0):
                                # preds[key].append(np.nan)
                                preds[key].append(0)
                            else:
                                preds[key].append(sample.loc[0, key])
                        else:
                            preds[key].append(sample.loc[0, key])
                    except Exception as ex:
                        logger_network.error(ex)
                        preds[key].append(np.nan)
            return preds
        else:
            logger_network.error(
                'Wrapper accepts only one row of pandas.DataFrame at a time')
            return {}

    columns = list(set(self.nodes_names) - set(test.columns.to_list()))
    if not columns:
        logger_network.error("Test data is the same as train.")
        return {}

    preds = {column_name: list() for column_name in columns}

    processed_list = Parallel(n_jobs=parall_count)(
        delayed(wrapper)(self, test.loc[[i]], columns)
        for i in tqdm(test.index, position=0, leave=True))

    for i in range(test.shape[0]):
        curr_pred = processed_list[i]
        for key in columns:
            preds[key].append(curr_pred[key][0])

    # for column in columns:
    #     preds[column] = [k for k in preds[column] if not pd.isna(k)]
    return preds

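# Sketch: predicting the columns that are absent from the test frame (here
# 'age' is a hypothetical target; the frame must be NaN-free):
#
#   test_data = data.drop(columns=['age'])
#   predictions = bn.predict(test_data, parall_count=4)
#   predictions['age']   # one predicted value per test row
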
def sample(self,
           n: int,
           evidence: Optional[Dict[str, Union[str, int, float]]] = None,
           as_df: bool = True,
           predict: bool = False,
           parall_count: int = 1) -> \
        Union[None, pd.DataFrame, List[Dict[str, Union[str, int, float]]]]:
    """
    Sampling from Bayesian Network
    n: int, number of samples
    evidence: values for nodes fixed by the user
    as_df: return a pd.DataFrame instead of a list of dicts
    predict: take the most probable value instead of a random draw
    parall_count: number of threads. Defaults to 1.
    """
    from joblib import Parallel, delayed

    random.seed()
    if not self.distributions.items():
        logger_network.error(
            "Parameter learning wasn't done. Call fit_parameters method")
        return None
    if evidence:
        for node in self.nodes:
            if (node.type == 'Discrete') and (node.name in evidence.keys()):
                if not isinstance(evidence[node.name], str):
                    evidence[node.name] = str(int(evidence[node.name]))

    def wrapper():
        output = {}
        for node in self.nodes:
            parents = node.cont_parents + node.disc_parents
            if evidence and node.name in evidence.keys():
                output[node.name] = evidence[node.name]
            else:
                if not parents:
                    pvals = None
                else:
                    if self.type == 'Discrete':
                        pvals = [str(output[t]) for t in parents]
                    else:
                        pvals = [output[t] for t in parents]

                    # If any parent value is nan, sampling from this node is blocked.
                    if any(pd.isnull(pvalue) for pvalue in pvals):
                        output[node.name] = np.nan
                        continue
                if predict:
                    output[node.name] = node.predict(
                        self.distributions[node.name], pvals=pvals)
                else:
                    output[node.name] = node.choose(
                        self.distributions[node.name], pvals=pvals)
        return output

    seq = Parallel(n_jobs=parall_count)(
        delayed(wrapper)() for _ in tqdm(range(n), position=0, leave=True))

    seq_df = pd.DataFrame.from_dict(seq, orient='columns')
    seq_df.dropna(inplace=True)
    cont_nodes = [
        c.name for c in self.nodes
        if c.type != 'Discrete' and 'Logit' not in c.type
    ]
    positive_columns = [
        c for c in cont_nodes if self.descriptor['signs'][c] == 'pos'
    ]
    seq_df = seq_df[(seq_df[positive_columns] >= 0).all(axis=1)]
    seq_df.reset_index(inplace=True, drop=True)
    seq = seq_df.to_dict('records')
    if as_df:
        return pd.DataFrame.from_dict(seq, orient='columns')
    else:
        return seq

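# Sketch: drawing synthetic records, optionally pinning nodes with evidence
# (column names are illustrative). Note that rows violating the 'pos' sign
# constraint are dropped, so fewer than n rows may be returned:
#
#   synthetic = bn.sample(1000)                          # pd.DataFrame
#   conditioned = bn.sample(500, evidence={'sex': 'F'})
#   records = bn.sample(100, as_df=False)                # list of dicts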