def gen_hierarchy(self, data):
    """
    Create a new feature hierarchy:
    (i) from input hierarchy if available, and
    (ii) from feature set if not
    """
    num_features = data.shape[1]
    if self.feature_hierarchy is None:
        # Create hierarchy if not available
        if self.feature_names is None:
            # Generate feature names if not available
            self.feature_names = [f"{idx}" for idx in range(num_features)]
        root = Feature(constants.DUMMY_ROOT, description=constants.DUMMY_ROOT, perturbable=False)  # Dummy root node, shouldn't be perturbed
        for idx, feature_name in enumerate(self.feature_names):
            Feature(feature_name, parent=root, idx=[idx])
        self.feature_hierarchy = root
    else:
        # TODO: Document real hierarchy with examples
        # Input hierarchy needs a list of indices assigned to all base features
        # Create hierarchy over features from input hierarchy
        if isinstance(self.feature_hierarchy, str):
            # JSON hierarchy - import to anytree
            try:
                importer = JsonImporter()
                with open(self.feature_hierarchy, encoding="utf-8") as hierarchy_file:
                    self.feature_hierarchy = importer.read(hierarchy_file)
            except JSONDecodeError as error:
                raise ValueError(f"Feature hierarchy {self.feature_hierarchy} does not appear to be a valid JSON file") from error
        assert isinstance(self.feature_hierarchy, anytree.node.nodemixin.NodeMixin), \
            "Feature hierarchy does not appear to be a valid JSON file or an anytree node"
        feature_nodes = {}
        all_idx = set()
        # Parse and validate input hierarchy
        for node in anytree.PostOrderIter(self.feature_hierarchy):
            idx = []
            if node.is_leaf:
                valid = (hasattr(node, "idx") and isinstance(node.idx, list) and len(node.idx) >= 1
                         and all(isinstance(node.idx[i], int) for i in range(len(node.idx))))
                assert valid, f"Leaf node {node.name} must contain a non-empty list of integer indices under attribute 'idx'"
                assert not all_idx.intersection(node.idx), f"Leaf node {node.name} has index overlap with other leaf nodes"
                idx = node.idx
                all_idx.update(idx)
            else:
                # Ensure internal nodes have empty initial indices
                valid = not hasattr(node, "idx") or not node.idx
                assert valid, f"Internal node {node.name} must have empty initial indices under attribute 'idx'"
            description = getattr(node, "description", "")
            feature_nodes[node.name] = Feature(node.name, description=description, idx=idx)
        # Update feature group (internal node) indices and tree connections
        assert min(all_idx) >= 0 and max(all_idx) < num_features, "Feature indices in hierarchy must be in range [0, num_features - 1]"
        feature_node = None
        for node in anytree.PostOrderIter(self.feature_hierarchy):
            feature_node = feature_nodes[node.name]
            parent = node.parent
            if parent:
                feature_node.parent = feature_nodes[parent.name]
            for child in node.children:
                feature_node.idx += feature_nodes[child.name].idx
        self.feature_hierarchy = Feature(constants.DUMMY_ROOT, children=[feature_node], perturbable=False)  # Dummy root node for consistency with flat hierarchy; last feature_node is original root
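# A minimal sketch of an input hierarchy file accepted above, assuming anytree's
# JsonImporter defaults (nested objects with "name"/"children" keys); the file
# contents and feature indices below are illustrative, not from the original source:
#
#   {"name": "all", "children": [
#       {"name": "group_a", "children": [{"name": "f0", "idx": [0]},
#                                        {"name": "f1", "idx": [1]}]},
#       {"name": "f2", "idx": [2]}]}
#
# Leaves must carry non-empty, non-overlapping lists of integer indices under
# 'idx'; internal nodes must omit 'idx', since their indices are accumulated
# bottom-up from their children during the post-order pass.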
def simplify_tree(self):
    """
    Goes through the tree and removes nodes with only one child,
    replacing them with their only child. Purges unneeded nodes
    from the node_dictionary.

    See Algorithm 1 (line 11)
    """
    nodes = [node for node in at.PostOrderIter(self.root)]
    for node in nodes:
        children = node.children
        if len(children) == 1:
            children[0].parent = node.parent
            if node.parent is None:
                # The current root has only one child - let's replace it
                # with the only child
                self.root = children[0]
            node.parent = None
            try:
                self.node_dictionary.pop(node.name)
            except KeyError:
                print(node.name)
            try:
                self.category_dictionary.pop(node.name)
            except KeyError:
                print(node.name)
    # Recompute distances for the entities
    nodes = [node for node in at.PostOrderIter(self.root)]
    for node in nodes:
        if node.is_leaf:
            node._compute_distances()
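# A toy illustration of the single-child collapse performed by simplify_tree
# (a standalone sketch using plain anytree nodes; names are illustrative):
import anytree as at

_root = at.Node("root")
_a = at.Node("a", parent=_root)
_b = at.Node("b", parent=_a)
for _node in list(at.PostOrderIter(_root)):
    if len(_node.children) == 1:
        _only_child = _node.children[0]
        _only_child.parent = _node.parent
        if _node.parent is None:  # collapsing the root itself
            _root = _only_child
        _node.parent = None
# The chain root -> a -> b collapses to the single node b
assert _root.name == "b" and _root.is_leaf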
def update_hierarchy_relevance(hierarchy_root, relevant_feature_map, probs):
    """
    Add feature relevance information to nodes of hierarchy:
    their probability of being enabled, their polynomial coefficient
    """
    relevant_features = set()
    for key in relevant_feature_map:
        relevant_features.update(key)
    for node in anytree.PostOrderIter(hierarchy_root):
        node.description = constants.IRRELEVANT
        if node.is_leaf:
            idx = int(node.static_indices)
            node.poly_coeff = 0.0
            node.bin_prob = probs[idx]
            coeff = relevant_feature_map.get(frozenset([idx]))
            if coeff:
                node.poly_coeff = coeff
                node.description = ("%s feature:\nPolynomial coefficient: %f\nBinomial probability: %f"
                                    % (constants.RELEVANT, coeff, probs[idx]))
            elif idx in relevant_features:
                node.description = "%s feature\n(Interaction-only)" % constants.RELEVANT
        else:
            for child in node.children:
                if child.description != constants.IRRELEVANT:
                    node.description = constants.RELEVANT
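# Illustrative shape of the inputs consumed above (the values are assumptions):
# relevant_feature_map maps frozensets of feature indices to polynomial
# coefficients, e.g. {frozenset({3}): 1.5, frozenset({0, 2}): -0.7}; singleton
# keys mark individually relevant features, while indices that appear only in
# multi-element keys are tagged as interaction-only. probs[i] is the binomial
# probability of feature i being enabled.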
def merkle_branch(hash_, tree):
    tree_walker = anytree.PostOrderIter(tree)
    node = next((node for node in tree_walker if node.name == hash_), None)
    if not node:
        return None
    # TODO: come up with a sane branch representation
    return node
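# A minimal usage sketch (the hashing scheme and tree shape are illustrative,
# not taken from the original source):
import hashlib

def _sha256(data):
    return hashlib.sha256(data).hexdigest()

_leaf_a = anytree.Node(_sha256(b"a"))
_leaf_b = anytree.Node(_sha256(b"b"))
_mroot = anytree.Node(_sha256((_leaf_a.name + _leaf_b.name).encode()),
                      children=[_leaf_a, _leaf_b])
assert merkle_branch(_sha256(b"a"), _mroot) is _leaf_a   # found: the matching node
assert merkle_branch(_sha256(b"zzz"), _mroot) is None    # not found: None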
def get_input_wires(self):
    if not self.input_wires:
        # Leaf names are expected to be objects exposing left_wire/right_wire attributes
        leaves = [node.name for node in anytree.PostOrderIter(self.tree) if node.is_leaf]
        input_wires = [wire for leaf in leaves for wire in [leaf.left_wire, leaf.right_wire]]
        self.input_wires = input_wires
    return self.input_wires
def prune_empty_groups(tree: LayerGroupNode) -> LayerGroupNode:
    """Remove any leaf nodes which are not LayerNodes."""
    lt = copy.deepcopy(tree)
    # NOTE: We MUST iterate using PostOrderIter(lt) instead of lt.leaves
    # because removing a leaf group may leave its parent group newly enleafened.
    for node in anytree.PostOrderIter(lt):
        if node.is_leaf and type(node) is not LayerNode:
            logger.warning(f'{node.group_name_path=}, {node.name=}')
            _delete_node(node, msg='Removing empty group')
    return lt
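# The _delete_node helper isn't shown here; a plausible minimal sketch (an
# assumption, not the module's actual implementation) simply detaches the node
# from the deep-copied tree after logging:
def _delete_node(node: anytree.NodeMixin, msg: str = '') -> None:
    logger.info(f'{msg}: {node.name}')
    node.parent = None  # detaching removes the node (and its subtree) from the tree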
def evaluate_hierarchical(args, sfeatures, afeatures):
    """
    Evaluate hierarchical analysis results - obtain power/FDR measures for all nodes/base features
    """
    # pylint: disable = too-many-locals
    # Map features in hierarchy to original features and identify ground-truth importances/scores
    sfeatures_map = {sfeature.name: sfeature for sfeature in sfeatures}
    importance_map = {}
    score_map = {}
    for node in anytree.PostOrderIter(afeatures[0].root):
        if node.is_leaf:
            sfeature_name = str(node.idx[0])
            importance_map[node.name] = sfeatures_map[sfeature_name].important
            score_map[node.name] = sfeatures_map[sfeature_name].effect_size
        else:
            importance_map[node.name] = any(importance_map[child.name] for child in node.children)
    # Overall FDR/power
    important = np.zeros(len(afeatures))
    inferred_important = np.zeros(len(afeatures))
    for idx, afeature in enumerate(afeatures):
        important[idx] = importance_map[afeature.name]
        inferred_important[idx] = afeature.important
    imp_precision, imp_recall = get_precision_recall(important, inferred_important)
    # Base features FDR/power
    base_features = list(filter(lambda node: node.is_leaf, afeatures))
    base_important = np.zeros(len(base_features))
    inferred_base_important = np.zeros(len(base_features))
    for idx, base_feature in enumerate(base_features):
        base_important[idx] = importance_map[base_feature.name]
        inferred_base_important[idx] = base_feature.important
    base_imp_precision, base_imp_recall = get_precision_recall(base_important, inferred_base_important)
    # Importance scores for base features
    overall_scores_corr, overall_relevant_scores_corr = (1.0, 1.0)
    if args.model_type == REGRESSOR:
        scores = np.zeros(len(base_features))
        inferred_scores = np.zeros(len(base_features))
        for idx, base_feature in enumerate(base_features):
            scores[idx] = score_map[base_feature.name]
            inferred_scores[idx] = base_feature.importance_score
        relevant_base_features = list(filter(lambda node: importance_map[node.name], base_features))
        relevant_scores = np.zeros(len(relevant_base_features))
        relevant_inferred_scores = np.zeros(len(relevant_base_features))
        for idx, relevant_base_feature in enumerate(relevant_base_features):
            relevant_scores[idx] = score_map[relevant_base_feature.name]
            relevant_inferred_scores[idx] = relevant_base_feature.importance_score
        overall_scores_corr = pearsonr(scores, inferred_scores)[0] if len(scores) >= 2 else 1
        overall_relevant_scores_corr = pearsonr(relevant_scores, relevant_inferred_scores)[0] if len(relevant_scores) >= 2 else 1
    vals = {FDR: 1 - imp_precision, POWER: imp_recall,
            BASE_FEATURES_FDR: 1 - base_imp_precision, BASE_FEATURES_POWER: base_imp_recall,
            OVERALL_SCORES_CORR: overall_scores_corr, OVERALL_RELEVANT_SCORES_CORR: overall_relevant_scores_corr}
    return {key: value if isinstance(value, dict) else round(value, 10) for key, value in vals.items()}  # Round values to avoid FP discrepancies
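# get_precision_recall isn't shown here; a sketch of one standard definition
# consistent with its use above (an assumption, not the original helper),
# operating on 0/1 numpy arrays of ground-truth and inferred importances:
def get_precision_recall(truth, inferred):
    true_positives = float(np.sum(truth * inferred))
    precision = true_positives / np.sum(inferred) if np.sum(inferred) else 1.0
    recall = true_positives / np.sum(truth) if np.sum(truth) else 1.0
    return precision, recall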
def delete_nodes(root_node, dry_run, delete_root, stop_node="root"):
    """
    Cascade deletes OSDF nodes in a depth-first manner.

    Args:
        root_node (anytree.Node): The root node of the tree to delete from.
        dry_run (boolean): True/False to delete or print out the nodes to be deleted.
        delete_root (boolean): If the stop_node parameter is set to 'root' this
            parameter can be passed to indicate we want to delete the root node as well.
        stop_node (string): Name of the node to stop deletion on. Defaults to the
            root node but can be any OSDF ID to stop on.

    Requires:
        None

    Returns:
        None
    """
    deleted = []
    failed_delete = []
    for node in anytree.PostOrderIter(root_node):
        if dry_run:
            if not node.name == "root":
                print("DELETING NODE:", node)
        else:
            osdf_obj = node.osdf
            if not node.name == "root":
                print("DELETING NODE:", node)
                res = osdf_obj.delete()
                if not res:
                    print("FAILED TO DELETE NODE:", node)
                    failed_delete.append(osdf_obj)
    if failed_delete:
        print("WARNING: The following OSDF nodes were not deleted:" + "\n".join(str(obj) for obj in failed_delete))
    if delete_root and stop_node == "root":
        print("DELETING ROOT NODE:", root_node)
        if not dry_run:
            root_node.osdf.delete()
def gen_hierarchy(args, clustering_data):
    """
    Generate hierarchy over features

    Args:
        args: Command-line arguments
        clustering_data: Data potentially used to cluster features
            (depending on hierarchy generation method)

    Returns:
        hierarchy_root: root of resulting hierarchy over features
        feature_id_map: mapping from visual feature ids to original ids
    """
    # TODO: Get rid of possibly redundant hierarchy attributes e.g. vidx
    # Generate hierarchy
    hierarchy_root = None
    if args.hierarchy_type == constants.FLAT:
        args.contiguous_node_names = False  # Flat hierarchy should be automatically created; do not re-index hierarchy
    elif args.hierarchy_type == constants.CLUSTER_FROM_DATA:
        clusters = cluster_data(clustering_data)
        hierarchy_root = gen_hierarchy_from_clusters(args, clusters)
    elif args.hierarchy_type == constants.RANDOM:
        hierarchy_root = gen_random_hierarchy(args)
    else:
        raise NotImplementedError("Need valid hierarchy type")
    # Improve visualization - contiguous feature names
    feature_id_map = {}  # mapping from visual feature ids to original ids
    if args.contiguous_node_names:
        for idx, node in enumerate(anytree.PostOrderIter(hierarchy_root)):
            node.vidx = idx
            if node.is_leaf:
                node.min_child_vidx = idx
                node.max_child_vidx = idx
                node.num_base_features = 1
                node.name = str(idx)
                feature_id_map[idx] = node.idx[0]
            else:
                node.min_child_vidx = min([child.min_child_vidx for child in node.children])
                node.max_child_vidx = max([child.vidx for child in node.children])
                node.num_base_features = sum([child.num_base_features for child in node.children])
                node.name = "[%d-%d] (size: %d)" % (node.min_child_vidx, node.max_child_vidx, node.num_base_features)
    return hierarchy_root, feature_id_map
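# Illustrative effect of the contiguous renaming above: after re-indexing, a
# leaf originally holding idx=[7] might become the node named "0" with
# feature_id_map[0] == 7, while an internal node spanning visual ids 0-2 over
# three base features is renamed "[0-2] (size: 3)".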
def compare_with_ground_truth(args, hierarchy_root):
    """Compare results from mihifepe with ground truth results"""
    # Generate ground truth results
    # Write hierarchical FDR input file for ground truth values
    args.logger.info("Compare mihifepe results to ground truth")
    input_filename = "%s/ground_truth_pvalues.csv" % args.output_dir
    with open(input_filename, "w", newline="") as input_file:
        writer = csv.writer(input_file)
        writer.writerow([constants.NODE_NAME, constants.PARENT_NAME, constants.PVALUE_LOSSES, constants.DESCRIPTION])
        for node in anytree.PostOrderIter(hierarchy_root):
            parent_name = node.parent.name if node.parent else ""
            # Decide p-values based on rough heuristic for relevance
            node.pvalue = 1.0
            if node.description != constants.IRRELEVANT:
                if node.is_leaf:
                    node.pvalue = 0.001
                    if node.poly_coeff:
                        node.pvalue = min(node.pvalue, 1e-10 / (node.poly_coeff * node.bin_prob)**3)
                else:
                    node.pvalue = 0.999 * min([child.pvalue for child in node.children])
            writer.writerow([node.name, parent_name, node.pvalue, node.description])
    # Generate hierarchical FDR results for ground truth values
    ground_truth_dir = "%s/ground_truth_fdr" % args.output_dir
    cmd = ("python -m mihifepe.fdr.hierarchical_fdr_control -output_dir %s -procedure yekutieli "
           "-rectangle_leaves %s" % (ground_truth_dir, input_filename))
    args.logger.info("Running cmd: %s" % cmd)
    pass_args = cmd.split()[2:]
    with patch.object(sys, 'argv', pass_args):
        hierarchical_fdr_control.main()
    # Compare results
    ground_truth_outputs_filename = "%s/%s.png" % (ground_truth_dir, constants.TREE)
    args.logger.info("Ground truth results: %s" % ground_truth_outputs_filename)
    mihifepe_outputs_filename = "%s/%s/%s.png" % (args.output_dir, constants.HIERARCHICAL_FDR_DIR, constants.TREE)
    args.logger.info("mihifepe results: %s" % mihifepe_outputs_filename)
def prune(self, error_bound):
    clf = self.copy()

    def compute_error(clf, original, alternative):
        # just compute the error if no switch is needed
        if original == alternative:
            return error_bound(clf)
        # switch the original node with the alternative in clf
        old_parents = (original.parent, alternative.parent)
        Node.replace(original, alternative)
        if old_parents[0] is None:
            clf.tree = alternative
        # compute the error with the resulting tree
        output = error_bound(clf)
        # return to the original tree
        Node.replace(alternative, original, old_parents[-1])
        if old_parents[0] is None:
            clf.tree = original
        return output

    # go over all nodes in a bottom-up manner
    for node in anytree.PostOrderIter(clf.tree):
        # add all alternatives to a list
        alternatives = []
        for label in clf.label_values:
            alternatives.append(Node(label))
        for child in node.children:
            alternatives.append(child)
        alternatives.append(node)
        # find the alternative which minimizes the bound on the error
        best_alternative = argmin(lambda x: compute_error(clf, node, x), alternatives)
        if best_alternative != node:
            Node.replace(node, best_alternative)
            node = best_alternative
            if best_alternative.is_root:
                clf.tree = best_alternative
    return clf
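# argmin above is not a builtin; a minimal sketch of what it presumably does
# (an assumption based on the call site):
def argmin(fn, items):
    """Return the item of items minimizing fn(item)."""
    return min(items, key=fn)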
def update_hierarchy_descriptions(hierarchy_root, relevant_feature_map, features):
    """
    Add feature relevance information to nodes of hierarchy
    """
    relevant_features = set()
    for key in relevant_feature_map:
        relevant_features.update(key)
    for node in anytree.PostOrderIter(hierarchy_root):
        node.description = constants.IRRELEVANT
        if node.is_leaf:
            idx = node.idx[0]
            coeff = relevant_feature_map.get(frozenset([idx]))
            if coeff:
                node.description = f"{constants.RELEVANT} feature:\nPolynomial coefficient: {coeff}\nSummary: {features[idx].summary()}"
            elif idx in relevant_features:
                node.description = f"{constants.RELEVANT} feature\n(Interaction-only)\nSummary: {features[idx].summary()}"
        else:
            for child in node.children:
                if child.description != constants.IRRELEVANT:
                    node.description = constants.RELEVANT
root = root.children[0]
# Identify the workspaces
for display in root.children:
    for wk in display.children[0].children:
        wk.workspace = True

# Get the current workspace
proc_out = subprocess.run(['swaymsg', '-t', 'get_workspaces'], stdout=subprocess.PIPE)
wkList = json.loads(proc_out.stdout.decode('utf-8'))
focWkName = nf.getFocusedWK(wkList)

# Change the tree such that the workspaces are children to the root
# while ignoring the current workspace
root.children = [node for node in at.PostOrderIter(root, filter_=lambda x: x.workspace)
                 if node.id != focWkName]

# If a workspace contains only one container, then remove that container
for node in at.PostOrderIter(root, filter_=lambda x: x.workspace):
    if len(node.children) == 1:
        node.children = node.children[0].children

# If containers have only one element, then remove such containers
for node in at.PreOrderIter(root, filter_=lambda x: x.container):
    if len(node.children) == 1:
        node.children[0].parent = node.parent
        node.parent = None

# Create names for containers
for node in at.PreOrderIter(root, filter_=lambda x: x.container):