def get_node_index(cls, graph, X, **kwargs):
    """
    Send the rows of ``X`` down ``graph`` and record which rows reach each node.

    :param graph: graph providing ``get_data``, ``n_child`` and a ``df`` with
        ``child_left`` / ``child_right`` columns
    :param X: sample data, one row per sample (converted with ``utils.make_ndarray``)
    :param kwargs: keys used:
        index (default ``0 .. X.shape[0] - 1``): one entry per row of ``X``
        verify (default True): check that every index entry ends in exactly one leaf
    :return: dict mapping each visited node id to the index entries that reach it
    :raises ValueError: if ``X`` and ``index`` differ in length, or if ``verify``
        finds an entry in several leaves or missing from the leaves
    """
    # Convert before reading ``X.shape``: the default index used to be built from
    # the raw argument, which fails for input without a ``shape`` attribute.
    # (assumes make_ndarray accepts such input - TODO confirm)
    X = utils.make_ndarray(X)
    index = dict_utils.get(kwargs, 'index', list(range(X.shape[0])))
    index = utils.make_ndarray(index, shape=-1)
    if X.shape[0] != index.shape[0]:
        raise ValueError('Mismatch data and index shape')

    verify = dict_utils.get(kwargs, 'verify', True)
    feature_data = graph.get_data(FEATURE_DATA_KEY)
    n_childes = graph.n_child()
    result = {}
    root = 0

    def recurse(node, node_index):
        result[node] = node_index
        # Only nodes with exactly two children forward their samples.
        if n_childes[node] == 2:
            # NOTE(review): the index entries are used as row positions of X here;
            # confirm that a custom ``index`` is expected to be positional.
            left_index, right_index = cls.get_child_index(
                X[node_index, ], node_index, feature_data[node])
            recurse(graph.df.at[node, 'child_left'], left_index)
            recurse(graph.df.at[node, 'child_right'], right_index)

    recurse(root, index)

    if verify:
        leaf_nodes = np.where(n_childes == 0)[0]
        leaf_index = dict_utils.subset_dict(result, leaf_nodes)
        result_array = np.concatenate(list(leaf_index.values()))
        unique, counts = np.unique(result_array, return_counts=True)
        if np.any(counts != 1):
            raise ValueError('Index in multiple leaf')
        if unique.shape[0] != index.shape[0]:
            raise ValueError('Missing index in result')
    return result
def _set_color_scalar(self, score, **kwargs):
    """
    Colour the nodes from a scalar score; does nothing when every score is NaN.

    :param score: scalar score values
    :param kwargs: keys used:
        normalize_bound (default: nan-aware min and max of ``score``): bound, or
            a callable that receives ``score`` and returns the bound
        reverse_cmap (default False): forwarded to ``default_color_map_scalar``
    """
    if np.all(np.isnan(score)):
        # No usable score: leave ``color`` and ``filled`` untouched.
        return

    def score_range(values):
        return (np.nanmin(values), np.nanmax(values))

    bound = dict_utils.get(kwargs, 'normalize_bound', score_range)
    reverse_cmap = dict_utils.get(kwargs, 'reverse_cmap', False)
    if callable(bound):
        bound = bound(score)
    self.color = self.default_color_map_scalar(score, bound, reverse_cmap)
    self.filled = True
def get_graph_info(tree, tree_dump, tree_iter, kwargs):
    """
    Build the graph_info dict purely from ``kwargs``.

    ``tree``, ``tree_dump`` and ``tree_iter`` are accepted but not read here.

    :param kwargs: must contain ``objective``, ``n_class`` and ``n_features``;
        ``features_name`` and ``class_name`` are optional
    :return: graph_info dict whose score keys are both None
    """
    optional_features_name = dict_utils.get(kwargs, 'features_name')
    optional_class_name = dict_utils.get(kwargs, 'class_name')
    empty_score_data = {'pred_score_key': None, 'color_score_key': None}
    return {
        '_fitted': True,
        'objective': kwargs['objective'],
        'n_class': kwargs['n_class'],
        'class_name': optional_class_name,
        'n_features': kwargs['n_features'],
        'features_name': optional_features_name,
        'score_data': empty_score_data,
    }
def has_split(df, node=None):
    """
    Tell whether nodes carry a split.

    :param df: graph data frame with a ``data`` column
    :param node: node id to test; when None every row of ``df`` is tested
    :return: for a single node, the result for that node only; otherwise a
        Series with one boolean per row
    """
    if node is not None:
        split = dict_utils.get(df.at[node, 'data'], key=FEATURE_DATA_KEY)
        if split is None or split.is_none():
            return False
        # A node that does hold feature data used to fall through and get the
        # whole-frame Series back; answer for this node with the same rule the
        # per-row branch applies.
        return split.is_split()
    splits = BiGraphDF.get_data(df, keys=FEATURE_DATA_KEY)
    return splits.apply(lambda x: x is not None and x.is_split())
def compare_fit(cls, graph, X_test, y_test, **kwargs):
    """
    Score an already fitted graph on new data and return an updated copy.

    :param graph: fitted graph
    :param X_test: data forwarded to ``cls._fit``
    :param y_test: target forwarded to ``cls._fit``
    :param kwargs: forwarded to ``cls._fit``; keys used here:
        score_handler (default ``_default_compare_score_handler``)
    :return: result of ``graph.update_graph(..., inplace=False)``
    :raises NotFittedError: if ``graph.is_fitted()`` is false
    """
    score_handler = dict_utils.get(kwargs, 'score_handler',
                                   _default_compare_score_handler)
    if not graph.is_fitted():
        raise NotFittedError

    # Best effort: a graph without a stored population field falls back to an
    # empty dict. ``except Exception`` keeps that behaviour while no longer
    # swallowing KeyboardInterrupt / SystemExit as the bare ``except`` did.
    try:
        population_cal_field = graph.score_data[
            graph.score_data['population_cal_field_key']]
    except Exception:
        population_cal_field = {}

    kwargs['fit_population_cal_field'] = population_cal_field
    kwargs['fit_score_dict'] = graph.get_score().to_dict()
    kwargs['score_handler'] = score_handler
    actual_population_cal_field, scores = cls._fit(graph, X_test, y_test,
                                                   **kwargs)
    score_data = {
        'pred_score_key': score_handler.pred_score_key,
        'color_score_key': score_handler.color_score_key,
        'population_cal_field_key': score_handler.population_cal_field_key,
        score_handler.population_cal_field_key: population_cal_field,
        # NOTE(review): this "..._key" entry stores the field itself rather than
        # a key name - confirm that is intended.
        'actual_population_cal_field_key': actual_population_cal_field
    }
    return graph.update_graph(score=scores,
                              new_graph_info={'score_data': score_data},
                              inplace=False)
def to_networkx(self, **kwargs):
    """
    Export the graph as a ``networkx.DiGraph``.

    :param kwargs: forwarded to the helper class; keys used here:
        max_depth (default: unlimited): deepest level to export, root is depth 0
        nx_helper (default ``NetworkxHelper``): class that supplies ``filled``,
            ``color``, ``labels`` and ``graph_info``
    :return: the DiGraph
    :raises ValueError: if ``max_depth`` is negative
    """
    # ``np.inf`` instead of the ``np.Inf`` alias, which NumPy 2.0 removed.
    max_depth = dict_utils.get(kwargs, 'max_depth', np.inf)
    NetworkxHelperClass = dict_utils.get(kwargs, 'nx_helper', NetworkxHelper)
    if max_depth < 0:
        raise ValueError("max depth should be non negative")
    nx_helper = NetworkxHelperClass(self, **kwargs)

    graph = nx.DiGraph(
        node={
            'color': 'black',
            'fontname': 'helvetica',
            'shape': 'box',
            # bool * str: the 'filled, ' prefix appears only when filled is set
            'style': 'filled, ' * nx_helper.filled + 'rounded'
        },
        edge={'fontname': 'helvetica'},
        graph_info=nx_helper.graph_info)

    def add_node(node, parent, depth):
        if depth > max_depth:
            return
        attrs = {
            'type': self.df.at[node, 'type'],
            'data': self.df.at[node, 'data'],
            'label': nx_helper.labels[node],
        }
        # A fill colour is attached only when the helper produced one.
        if nx_helper.filled and (not pd.isnull(nx_helper.color[node])):
            attrs['fillcolor'] = nx_helper.color[node]
        graph.add_node(node, **attrs)

        # The root (depth 0) has no incoming edge; only the two edges leaving
        # the root carry a label.
        if depth > 1:
            graph.add_edge(parent, node)
        elif depth == 1:
            graph.add_edge(parent, node,
                           label=self.df.at[node, 'type'] == -1)

        if self.n_child(node) == 2:
            add_node(self.df.at[node, 'child_left'], node, depth + 1)
            add_node(self.df.at[node, 'child_right'], node, depth + 1)

    add_node(0, ROOT_PARENT, 0)
    return graph
def default_compare_score_fn(graph, node, y, population_cal_field, score_dict,
                             fit_population_cal_field, fit_score_dict,
                             **kwargs):
    """
    Pair the score stored for ``node`` at fit time with the mean of ``y``.

    :param graph: graph whose ``score_data['pred_score_key']`` names the stored score
    :param node: node id used to look up ``fit_score_dict``
    :param y: target values, averaged with ``np.mean``
    :param fit_score_dict: mapping node id -> score dict recorded at fit time
    :return: dict with ``train_y`` (NaN when the stored score is absent) and ``actual_y``
    """
    score_key = graph.score_data['pred_score_key']
    fitted_value = dict_utils.get(fit_score_dict[node], score_key, np.nan)
    observed_value = np.mean(y)
    return {'train_y': fitted_value, 'actual_y': observed_value}
def get_child(tree_iter, node, kwargs):
    """
    Return the two children of ``node``, ordered left then right.

    A child whose ``type`` attribute is -1 is placed on the left and one whose
    type is 1 on the right; without that information the edge order is kept.

    :param tree_iter: graph exposing ``edges(node)`` and ``nodes[...]``
    :param node: node whose children are requested
    :param kwargs: not read here
    :return: None when ``node`` has no edge, otherwise
        ``(tree_iter, left, tree_iter, right)``
    :raises ValueError: if ``node`` does not have exactly two edges, or an edge
        does not start at ``node``
    """
    out_edges = tree_iter.edges(node)
    if len(out_edges) == 0:
        return None
    if len(out_edges) != 2:
        raise ValueError('Invalid BiGraph')

    (source_a, child_a), (source_b, child_b) = out_edges
    if not (source_a == node and source_b == node):
        raise ValueError('Invalid BiGraph')

    type_a = dict_utils.get(tree_iter.nodes[child_a], 'type')
    type_b = dict_utils.get(tree_iter.nodes[child_b], 'type')
    if type_a not in [-1, 1]:
        type_a = None

    left, right = child_a, child_b
    # Swap when the first child is marked right, or when it is unmarked and the
    # second child is marked left.
    if type_a == 1 or (type_a != -1 and type_b == -1):
        left, right = child_b, child_a
    return tree_iter, left, tree_iter, right
def __init__(self, graph, **kwargs):
    """
    Collect display settings from ``kwargs`` and the graph info, then run
    ``execute_function``.

    :param graph: graph providing ``get_graph_info(exclude_df=True)``
    :param kwargs: keys used here: show_id, decimals, features_name,
        pred_score_key, color_score_key, objective, n_class, _fitted, keys,
        data_keys, score_keys; all kwargs are forwarded to ``execute_function``
    """
    self.show_id = dict_utils.get(kwargs, 'show_id', None)
    self.decimals = dict_utils.get(kwargs, 'decimals', 4)

    graph_info = graph.get_graph_info(exclude_df=True)
    score_data = dict_utils.get(graph_info, 'score_data')
    self.graph_info = graph_info

    def from_graph_info(key):
        # kwargs are consulted before the graph info
        return dict_utils.get_first([kwargs, graph_info], key)

    def from_score_data(key):
        # kwargs are consulted before the stored score data
        return dict_utils.get_first([kwargs, score_data], key)

    self.features_name = from_graph_info('features_name')
    self.pred_score_key = from_score_data('pred_score_key')
    self.color_score_key = from_score_data('color_score_key')
    self.objective = from_graph_info('objective')
    self.n_class = from_graph_info('n_class')
    self.fitted = from_graph_info('_fitted')

    # ``keys`` is the shared fallback for both key selections.
    shared_keys = dict_utils.get(kwargs, 'keys', None)
    self.data_keys = OrderedSet(
        dict_utils.get(kwargs, 'data_keys', shared_keys))
    self.score_keys = OrderedSet(
        dict_utils.get(kwargs, 'score_keys', shared_keys))

    # The feature data key goes first when it was not requested.
    if FEATURE_DATA_KEY not in self.data_keys:
        self.data_keys = OrderedSet((FEATURE_DATA_KEY,)) | self.data_keys
    # The prediction score key is appended when it is known but not requested.
    pred_key = self.pred_score_key
    if pred_key is not None and pred_key not in self.score_keys:
        self.score_keys = self.score_keys | OrderedSet((pred_key,))

    self.filled = False
    self.color = None
    self.labels = None
    self.execute_function(graph, **kwargs)
def from_custom_extractor(extractor, tree, **kwargs):
    """
    Build a BiGraph with a user supplied extractor.

    :param extractor: custom graph extractor whose ``extract_graph`` returns
        ``(df, graph_info)``
    :param tree: first input of the extractor
    :param kwargs: additional kwargs of the extractor
        keys used as input of BiGraph init:
            verify (default False),
            reindex (default False),
            copy_df (default False),
            copy_graph (default False),
            copy (default None)
    :return: BiGraph
    """
    init_defaults = (
        ('verify', False),
        ('reindex', False),
        ('copy_df', False),
        ('copy_graph', False),
        ('copy', None),
    )
    # Read the BiGraph options before the extractor receives ``kwargs``.
    init_options = {}
    for option, fallback in init_defaults:
        init_options[option] = dict_utils.get(kwargs, option, fallback)

    df, graph_info = extractor.extract_graph(tree, kwargs=kwargs)
    return BiGraph(df=df, graph_info=graph_info, **init_options)
def extract_graph(cls, tree, kwargs=None):
    """
    Flatten ``tree`` into one row per node plus a graph_info dict.

    Row ids are list positions: the root is row 0 and the two children of a
    node are appended together, left first, before either subtree is visited.

    :param tree: model handed to ``verify``, ``set_defaults``, ``get_dump`` and
        ``get_graph_info``
    :param kwargs: option dict shared with every hook (may be modified by them);
        None means an empty dict
    :return: ``(df, graph_info)`` where ``df`` has the columns ``GRAPH_COL``
    """
    kwargs = kwargs if kwargs is not None else {}
    cls.verify(tree, kwargs)
    cls.set_defaults(tree, kwargs)
    tree_dump = cls.get_dump(tree, kwargs)
    tree_iter, root_node = cls.get_iterator(tree_dump, kwargs)

    def make_row(node_iter, node, is_leaf, depth, parent, node_type):
        # One row; the child columns start as TREE_LEAF and are filled in by
        # ``recurse`` once the node turns out to have children.
        data = cls.get_data(node_iter, node, is_leaf=is_leaf, kwargs=kwargs)
        score = cls.get_score(node_iter, node, is_leaf=is_leaf, kwargs=kwargs)
        return dict(zip(GRAPH_COL, [depth, parent, TREE_LEAF, TREE_LEAF,
                                    node_type, data, score]))

    root_is_leaf = not cls.has_child(tree_iter, root_node, kwargs)
    row_list = [make_row(tree_iter, root_node, root_is_leaf,
                         0, ROOT_PARENT, TYPE_ROOT)]

    def recurse(node_iter, parent, parent_node, depth):
        # parent: internal node id; parent_node: tree node id
        if not cls.has_child(node_iter, parent_node, kwargs):
            return
        left_iter, left_node, right_iter, right_node = cls.get_child(
            node_iter, parent_node, kwargs)
        left_is_leaf = not cls.has_child(left_iter, left_node, kwargs)
        right_is_leaf = not cls.has_child(right_iter, right_node, kwargs)

        # The ids must be taken before the rows are appended.
        left_id = len(row_list)
        right_id = left_id + 1
        row_list.append(make_row(left_iter, left_node, left_is_leaf,
                                 depth, parent, TYPE_LEFT))
        row_list.append(make_row(right_iter, right_node, right_is_leaf,
                                 depth, parent, TYPE_RIGHT))
        row_list[parent].update({'child_left': left_id,
                                 'child_right': right_id})

        recurse(left_iter, left_id, left_node, depth + 1)
        recurse(right_iter, right_id, right_node, depth + 1)

    recurse(tree_iter, parent=0, parent_node=root_node, depth=1)

    graph_info = cls.get_graph_info(tree, tree_dump, tree_iter, kwargs)
    graph_info = graph_info if graph_info is not None else {}
    df = pd.DataFrame(row_list, columns=GRAPH_COL)
    return df, graph_info
def get_graph_info(tree, tree_dump, tree_iter, kwargs):
    """
    Build the graph_info dict from a fitted estimator and ``kwargs``.

    ``tree_dump`` and ``tree_iter`` are accepted but not read here.

    :param tree: fitted estimator; presumably a scikit-learn decision tree, since
        ``n_classes_`` / ``classes_`` / ``n_features_`` are read - verify against caller
    :param kwargs: must contain ``objective``, ``pred_score_key`` and
        ``color_score_key``; ``features_name`` is optional
    :return: graph_info dict
    """
    if kwargs['objective'] == 'classification':
        n_class = tree.n_classes_
        class_name = tree.classes_
    else:
        n_class = None
        class_name = None

    # scikit-learn deprecated ``n_features_`` in 1.0 and removed it in 1.2 in
    # favour of ``n_features_in_``; prefer the new name and fall back to the old
    # one for estimators that only expose the latter.
    n_features = getattr(tree, 'n_features_in_', None)
    if n_features is None:
        n_features = tree.n_features_

    features_name = dict_utils.get(kwargs, 'features_name')
    graph_info = {
        '_fitted': True,
        'objective': kwargs['objective'],
        'n_class': n_class,
        'class_name': class_name,
        'n_features': n_features,
        'features_name': features_name,
        'score_data': {
            'pred_score_key': kwargs['pred_score_key'],
            'color_score_key': kwargs['color_score_key']
        }
    }
    return graph_info
def get_score(df, keys=None, default=None, order_dict=False, missing='ignore'):
    """
    Read entries from the per-node dicts stored in ``df['score']``.

    :param df: graph data frame with a ``score`` column
    :param keys: key or (nested) list of keys; None returns the raw column
    :param default: value used for an absent key
    :param order_dict: forwarded to ``dict_utils.subset_dict`` (several keys only)
    :param missing: forwarded to ``dict_utils.subset_dict`` (several keys only)
    :return: the ``score`` column itself when ``keys`` is None; otherwise a Series
        holding the value for a single key, or a sub-dict for several keys,
        named ``tuple(keys)``
    """
    if keys is None:
        return df['score']
    keys = utils.flatten_list(keys)
    if len(keys) == 1:
        series = df['score'].apply(
            lambda x: dict_utils.get(x, keys[0], default))
    else:
        series = df['score'].apply(lambda x: dict_utils.subset_dict(
            x, keys, default, order_dict, missing))
    # ``Series.rename`` with a non-mapping sets ``Series.name``, which must be
    # hashable; passing the key list raises TypeError on current pandas
    # (presumably ``flatten_list`` returns a list - TODO confirm). A tuple
    # carries the same keys in hashable form.
    series.name = tuple(keys)
    return series
def fit(cls, graph, X, y, **kwargs):
    """
    Score ``graph`` on ``(X, y)`` and return an updated copy marked as fitted.

    :param graph: graph to score
    :param X: data forwarded to ``cls._fit``
    :param y: target forwarded to ``cls._fit``
    :param kwargs: forwarded to ``cls._fit``; keys used here:
        score_handler (default ``_default_fit_score_handler``)
    :return: result of ``graph.update_graph(..., inplace=False)``
    """
    handler = dict_utils.get(kwargs, 'score_handler',
                             _default_fit_score_handler)
    # Pass the resolved handler on, so ``_fit`` sees the default as well.
    kwargs['score_handler'] = handler
    population_cal_field, scores = cls._fit(graph, X, y, **kwargs)

    score_data = {
        'pred_score_key': handler.pred_score_key,
        'color_score_key': handler.color_score_key,
        'population_cal_field_key': handler.population_cal_field_key,
        handler.population_cal_field_key: population_cal_field,
    }
    fitted_info = {
        'score_data': score_data,
        '_fitted': True,
        '_internally_fitted': True,
    }
    return graph.update_graph(score=scores, new_graph_info=fitted_info,
                              inplace=False)
def compare_fit(self, X, y, **kwargs):
    """
    Delegate to ``compare_fit`` of the data helper.

    :param X: data passed through to the helper
    :param y: target passed through to the helper
    :param kwargs: passed through unchanged; keys used here:
        data_helper (default ``DataHelper``)
    :return: whatever the helper's ``compare_fit`` returns
    """
    helper = dict_utils.get(kwargs, 'data_helper', DataHelper)
    compared = helper.compare_fit(self, X, y, **kwargs)
    return compared
def predict(self, X, **kwargs):
    """
    Delegate to ``predict`` of the data helper.

    :param X: data passed through to the helper
    :param kwargs: passed through unchanged; keys used here:
        data_helper (default ``DataHelper``)
    :return: whatever the helper's ``predict`` returns
    """
    helper = dict_utils.get(kwargs, 'data_helper', DataHelper)
    prediction = helper.predict(self, X, **kwargs)
    return prediction
def set_defaults(tree, kwargs):
    """
    Store ``tree_index`` in ``kwargs`` as an int, defaulting to 0.

    :param tree: not read here
    :param kwargs: option dict, modified in place
    """
    requested_index = dict_utils.get(kwargs, 'tree_index', 0)
    kwargs['tree_index'] = int(requested_index)
def get_score(tree_iter, node, is_leaf, kwargs):
    """
    Return the ``score`` attribute stored on ``node``.

    :param tree_iter: graph exposing ``nodes[...]``
    :param node: node to read
    :param is_leaf: not read here
    :param kwargs: not read here
    """
    node_attributes = tree_iter.nodes[node]
    return dict_utils.get(node_attributes, 'score')
def get_data(tree_iter, node, is_leaf, kwargs):
    """
    Return the ``data`` attribute stored on ``node``.

    :param tree_iter: graph exposing ``nodes[...]``
    :param node: node to read
    :param is_leaf: not read here
    :param kwargs: not read here
    """
    node_attributes = tree_iter.nodes[node]
    return dict_utils.get(node_attributes, 'data')
def set_defaults(tree, kwargs):
    """
    Make sure ``kwargs`` has a ``root_id`` entry, defaulting to None.

    :param tree: not read here
    :param kwargs: option dict, modified in place
    """
    root_id = dict_utils.get(kwargs, 'root_id', None)
    kwargs['root_id'] = root_id