def check_is_valid_linkage_various_size(self, nrow, ncol, valid):
    # Helper: slice a fixed 2 x 5 double matrix down to (nrow, ncol) and
    # check that is_valid_linkage() reports `valid` for it. When the slice
    # is expected to be invalid, also check that throw=True raises
    # ValueError.
    template = np.asarray([[0, 1, 3.0, 2, 5],
                           [3, 2, 4.0, 3, 3]], dtype=np.double)
    Z = template[:nrow, :ncol]
    observed = is_valid_linkage(Z)
    assert_(observed == valid)
    if valid:
        return
    assert_raises(ValueError, is_valid_linkage, Z, throw=True)
def test_is_valid_linkage_4_and_up(self): # Tests is_valid_linkage(Z) on linkage on observation sets between # sizes 4 and 15 (step size 3). for i in xrange(4, 15, 3): y = np.random.rand(i*(i-1)//2) Z = linkage(y) assert_(is_valid_linkage(Z) == True)
def test_is_valid_linkage_4_and_up_neg_counts(self): # Tests is_valid_linkage(Z) on linkage on observation sets between # sizes 4 and 15 (step size 3) with negative counts. for i in xrange(4, 15, 3): y = np.random.rand(i*(i-1)//2) Z = linkage(y) Z[i//2,3] = -2 assert_(is_valid_linkage(Z) == False) assert_raises(ValueError, is_valid_linkage, Z, throw=True)
def test_is_valid_linkage_empty(self): # Tests is_valid_linkage(Z) with empty linkage. Z = np.zeros((0, 4), dtype=np.double) assert_(is_valid_linkage(Z) == False) assert_raises(ValueError, is_valid_linkage, Z, throw=True)
def test_is_valid_linkage_int_type(self): # Tests is_valid_linkage(Z) with integer type. Z = np.asarray([[0, 1, 3.0, 2], [3, 2, 4.0, 3]], dtype=np.int) assert_(is_valid_linkage(Z) == False) assert_raises(TypeError, is_valid_linkage, Z, throw=True)
def _to_dtw_tree(linkage, hierarchical_clustering_object, prototypes, prototyping_function='mean'):
    """
    Converts a hierarchical clustering linkage matrix `linkage` to hierarchy of `DTWClusterNode`s.
    This is a modification of `scipy.cluster.hierarchy.to_tree` function and the code is mostly taken from it.

    :param linkage: linkage matrix to convert to the DTW Tree
    :param hierarchical_clustering_object: hierarchical clustering object to work with;
                                           its ``data`` attribute is read through ``data.items`` and
                                           ``data.ix`` (presumably a pandas Panel-like container -- TODO confirm)
    :param prototypes: container indexed by node id holding precomputed prototypes for the
                       non-leaf nodes. When truthy it takes precedence over `prototyping_function`
    :param prototyping_function: "reduce" function for prototype calculation, or "mean" to simply use data mean.
                                 A callable receives
                                 ``(left.prototype.values, right.prototype.values, left.count, right.count)``
    :return: tuple ``(root, nodes)``: the node created for the last row of the linkage and the list of all
             ``2 * n - 1`` nodes (the ``n`` leaves first, then one node per linkage row)
    :raises ValueError: if `linkage` is not a valid linkage matrix, or if `prototypes` is falsy and
                        `prototyping_function` is neither callable nor ``'mean'``
    """

    # Validation
    linkage = np.asarray(linkage, order='c')
    hierarchy.is_valid_linkage(linkage, throw=True, name='Z')

    data = hierarchical_clustering_object.data
    labels = data.items
    values = data.ix

    # A linkage with n - 1 rows describes a tree over n original objects
    n = linkage.shape[0] + 1

    # Create a list full of None's to store the node objects
    d = [None] * (n * 2 - 1)

    # Create the nodes corresponding to the n original objects.
    for i in range(n):
        index = labels[i]
        d[i] = DTWClusterNode(id=index,
                              hierarchical_clustering_object=hierarchical_clustering_object,
                              prototype=values[index])

    nd = None

    for i in range(n - 1):
        fi = int(linkage[i, 0])
        fj = int(linkage[i, 1])

        # A row may only refer to leaves or to clusters formed by earlier rows
        assert fi <= i + n
        assert fj <= i + n

        node_id = i + n
        left = d[fi]
        right = d[fj]

        use_data_mean = False
        if prototypes:
            prototype = prototypes[node_id]
        elif callable(prototyping_function):
            prototype = prototyping_function(left.prototype.values, right.prototype.values,
                                             left.count, right.count)
        elif prototyping_function == 'mean':
            # Filled in below, once the node exists and exposes ``nd.data``
            prototype = None
            use_data_mean = True
        else:
            # Previously this case fell through silently and reused the node
            # built in the previous iteration (or ``None`` on the first one).
            raise ValueError(
                "prototyping_function must be callable or 'mean', got {!r}".format(prototyping_function))

        nd = DTWClusterNode(id=node_id,
                            hierarchical_clustering_object=hierarchical_clustering_object,
                            prototype=prototype,
                            left=left, right=right,
                            dist=linkage[i, 2])

        if use_data_mean:
            # A bit hacky, but does job. Doing this as to get to use nd.data
            nd._prototype = nd.data.mean()

        # The count stored in the linkage must agree with the node built from it
        assert linkage[i, 3] == nd.count

        d[n + i] = nd

    return nd, d
def assign_domain_cluster_to_compartments(coordinates,
                                          domain_starts,
                                          compartment_dict,
                                          domain_linkage=None,
                                          linkage_method='complete',
                                          distance_metric='median',
                                          normalization=None,
                                          min_cluster_size_ratio=0.1,
                                          min_cluster_dist_ratio=0.08,
                                          assign_method='binary',
                                          return_boundary=True,
                                          verbose=True):
    """Function to assign domain clusters to given compartments in compartment_dict
    Idea:
        1. find normalized overlap ratio between domain_cluster and reference_compartment,
        2. assign bestmatch for each cluster
    ------------------------------------------------------------------------------------------
    Inputs:
        coordinates: distance map or zxy coordinates for a chromosome, np.ndarray (or like)
            a square 2d input is used as distance map, an (n, 3) input is converted
            to a distance map with pdist
        domain_starts: indices of domain start regions in this chromosome, np.ndarray(1d)
        compartment_dict: dictionary for compartment annotation, dict
            Note: this compartment_dict has to be exclusive
        domain_linkage: linkage matrix generated from scipy.cluster.hierarchy.linkage, np.ndarray
            (linkage result, default:None, generate from scratch)
        linkage_method: method for linkage if domain_linkage is not given, str (default: 'complete')
        distance_metric: metric for domain distance calculation, str (default: 'median')
        normalization: normalization matrix passed to domain_pdists as normalization_mat
            when domain_linkage is not given (default: None)
        min_cluster_size_ratio: minimal size of cluster ratio to chromosome size, float (default: 0.1)
            Note: accepted for interface compatibility, not used by this function
        min_cluster_dist_ratio: minimal distance of cluster ratio to number of domains, float (default: 0.08)
        assign_method: method for assigning compartments, str {'binary'|'continuous'}
        return_boundary: whether also return compartment boundaries, bool (default: True)
        verbose: whether say something!, bool (default: True)
    Output:
        _assigned_dict: compartment label -> per-region assignment array, dict
            (1/0 for 'binary', normalized overlap score for 'continuous')
        _cluster_bds: (only if return_boundary) sorted unique region indices where
            any compartment assignment increases, np.ndarray
        If building the linkage fails, None (or (None, None) if return_boundary) is returned.
    Raises:
        ValueError: for invalid coordinates, domain_starts, compartment_dict,
            domain_linkage or assign_method
    """
    ## check inputs
    # coordinate
    coordinates = np.array(coordinates)
    if verbose:
        print("-- assign domain-clusters to compartments with", end=' ')
    if len(np.shape(coordinates)) != 2:
        raise ValueError(
            f"Wrong input shape for coordinates, should be 2d but {len(np.shape(coordinates))} is given"
        )
    elif np.shape(coordinates)[0] == np.shape(coordinates)[1]:
        if verbose:
            print("distance map")
        _mat = coordinates
    elif np.shape(coordinates)[1] == 3:
        if verbose:
            print("3d coordinates")
        _mat = squareform(pdist(coordinates))
    else:
        raise ValueError(
            "Input coordinates should be distance-matrix or 3d-coordinates!")
    _n_regions = _mat.shape[0]

    # domain_starts
    # builtin int replaces np.int, which was removed in NumPy 1.24
    domain_starts = np.array(domain_starts, dtype=int)
    if domain_starts.size == 0:
        # previously failed later with an opaque IndexError
        raise ValueError(
            "Wrong input domain_starts: empty, at least one domain start is required")
    for _s in domain_starts:
        if _s < 0 or _s > _n_regions:
            raise ValueError(
                f"Wrong input domain_starts: {_s}, should be index of coordinates"
            )
    # each domain ends where the next one starts; the last one ends at the chromosome end
    domain_ends = np.zeros(np.shape(domain_starts), dtype=int)
    domain_ends[:-1] = domain_starts[1:]
    domain_ends[-1] = _n_regions

    # compartment_dict
    _ref_inds = []
    for _v in compartment_dict.values():
        _ref_inds += list(_v)
    _uids, _ucounts = np.unique(_ref_inds, return_counts=True)
    if (_ucounts > 1).any():
        raise ValueError(
            f"There are non unique ids used in reference:{compartment_dict}")
    elif (_uids > _n_regions).any():
        raise ValueError(
            f"Wrong ind given in compartment_dict:{compartment_dict}, should be index of coordinates"
        )

    # domain_linkage
    if domain_linkage is not None and not is_valid_linkage(domain_linkage):
        raise ValueError(
            "domain_liknage should be a linkage type array from scipy.cluster.hierarchy.linkage"
        )
    elif domain_linkage is None:
        _dom_pdists = domain_tools.distance.domain_pdists(
            coordinates,
            domain_starts,
            metric=distance_metric,
            normalization_mat=normalization)
        _cov_mat = np.corrcoef(squareform(_dom_pdists))
        try:
            domain_linkage = linkage(_cov_mat, method=linkage_method)
        except ValueError:
            print("failed to build linkage, exit.")
            if return_boundary:
                return None, None
            else:
                return None

    # assign_method
    _allowed_assign_method = ['binary', 'continuous']
    assign_method = str(assign_method).lower()
    if assign_method not in _allowed_assign_method:
        raise ValueError(
            f"Wrong input assign_method:{assign_method}, should be within {_allowed_assign_method}"
        )

    ## 1. acquire exclusive clusters
    # get all subnodes
    _rootnode, _nodelist = to_tree(domain_linkage, rd=True)
    # get selection threshold
    _dist_th = len(domain_starts) * min_cluster_dist_ratio
    if verbose:
        print(f"--- threshold for cluster distance={_dist_th}")
    # Walk nodes in id order (leaves first, then merges in linkage order). For each
    # merge above the threshold keep every child that shares no leaf with clusters
    # kept so far. Leaf ids of kept clusters are tracked in a set instead of being
    # re-collected for every node.
    _kept_clusters = []
    _kept_leaf_lists = []
    _kept_leaf_ids = set()
    for _node in _nodelist:
        if _node.is_leaf() or not (_node.dist > _dist_th):
            continue
        _candidates = [(_child, _child.pre_order(lambda x: x.id))
                       for _child in (_node.left, _node.right)]
        # evaluate both children against the current state before updating it
        _accepted = [(_child, _leaf_ids) for _child, _leaf_ids in _candidates
                     if _kept_leaf_ids.isdisjoint(_leaf_ids)]
        for _child, _leaf_ids in _accepted:
            _kept_clusters.append(_child)
            _kept_leaf_lists.append(_leaf_ids)
            _kept_leaf_ids.update(_leaf_ids)

    # convert domain ID to region_id
    _reg_id_list = []
    for _leaf_ids in _kept_leaf_lists:
        _dom_ids = np.array(_leaf_ids, dtype=int)
        _reg_ids = [
            np.arange(domain_starts[_d], domain_ends[_d]).astype(int)
            for _d in _dom_ids
        ]
        _reg_id_list.append(np.concatenate(_reg_ids))

    ## 2. with selected clusters, calculate its overlap with compartments
    # init
    _keys = list(compartment_dict.keys())
    _decision_dict = {_k: np.zeros(len(_reg_id_list)) for _k in _keys}
    for _ckey, _cinds in compartment_dict.items():
        for _j, _rids in enumerate(_reg_id_list):
            # overlap size normalized by both cluster size and compartment size
            _decision_dict[_ckey][_j] = len(np.intersect1d(
                _rids, _cinds)) / len(_rids) / len(_cinds)
    if verbose:
        print("--- decision_dict:", _decision_dict)

    ## summarize to a dict
    _assigned_dict = {_k: np.zeros(_n_regions) for _k in _keys}
    if assign_method == 'binary':
        for _j, _rids in enumerate(_reg_id_list):
            _match_ind = np.argmax([_decision_dict[_k][_j] for _k in _keys])
            _assigned_dict[_keys[_match_ind]][_rids] = 1
    elif assign_method == 'continuous':
        _norm_mat = np.stack([_decision_dict[_k] for _k in _keys])
        # clusters without any overlap give 0/0 here; those NaNs are zeroed right after
        with np.errstate(invalid='ignore', divide='ignore'):
            _norm_mat = _norm_mat / np.sum(_norm_mat, 0)
        _norm_mat[np.isnan(_norm_mat)] = 0
        for _j, _rids in enumerate(_reg_id_list):
            for _i, _k in enumerate(_keys):
                _assigned_dict[_k][_rids] = _norm_mat[_i, _j]

    # return
    if not return_boundary:
        return _assigned_dict
    # calculate compartment boundaries: positions where an assignment value increases
    _boundary_dict = {
        _k: np.where((_v[1:] - _v[:-1]) > 0)[0] + 1
        for _k, _v in _assigned_dict.items()
    }
    _cluster_bds = np.concatenate(list(_boundary_dict.values()))
    _cluster_bds = np.unique(_cluster_bds)
    return _assigned_dict, _cluster_bds
def _collect_downstream_leaders(subtree, L):
    """Return ids of the nearest descendants of `subtree` that are listed in `L`.

    The search descends from the children of `subtree` and stops on each branch
    as soon as a node id found in `L` is reached.
    Raises ValueError if a leaf node is reached during the descent.
    """
    found = []
    # Visiting order does not matter: callers only use the ids for membership tests.
    pending = [subtree.left, subtree.right]
    while pending:
        current = pending.pop()
        if current.left is None and current.right is None:
            raise ValueError(
                "While traversing the tree, a leaf node was reached"
                f", node {current.id}. In theory this should not occur."
            )
        if current.id in L:
            found.append(current.id)
        else:
            pending.extend([current.left, current.right])
    return found


def fcluster_combine_leaves(Z, t, criterion="distance", depth=2, R=None, monocrit=None):
    """Form flat clusters like `hierarchy.fcluster`, then merge away singleton leaves.

    AKA no leaf left behind: after `hierarchy.fcluster`, every flat cluster whose
    leader is an original observation (a "leaf leader") is merged with the node it
    is linked to in `Z`, moving up the tree until no leaf leader remains.

    Parameters
    ----------
    Z : ndarray
        Linkage matrix; validated with `hierarchy.is_valid_linkage(Z, throw=True)`.
    t, criterion, depth, R, monocrit
        Forwarded unchanged to `hierarchy.fcluster`.

    Returns
    -------
    ndarray
        Flat cluster label per observation. If the initial clustering has no leaf
        leaders, the array returned by `hierarchy.fcluster` is returned as is;
        otherwise the merged labels are renumbered to 0..k-1.

    Raises
    ------
    ValueError
        If `Z` is not a valid linkage, if the tree traversal reaches a leaf, or if
        leaf leaders remain after all candidate links were processed.
    """
    # check if Z is a valid linkage matrix
    hierarchy.is_valid_linkage(Z, throw=True)
    N = Z.shape[0] + 1
    # alternative: iteratively increase t, check for remaining leaves
    # move up the tree, merging leaf clusters until all leaves are merged into clusters
    T = hierarchy.fcluster(Z, t, criterion=criterion, depth=depth, R=R, monocrit=monocrit)
    L, M = hierarchy.leaders(Z, T)
    # leaders with an id below N are original observations, i.e. singleton clusters
    leaf_leaders = list(L[L < N])
    # no leaf clusters
    if len(leaf_leaders) == 0:
        return T

    max_cluster = T.max()
    # only links that involve at least one of the initial leaf leaders are relevant
    touches_leaf_leader = (np.isin(Z[:, 0], leaf_leaders)
                           | np.isin(Z[:, 1], leaf_leaders))
    links = Z[touches_leaf_leader, :2].astype("i")
    # ClusterNode list of the whole tree, built lazily and only once (Z never changes)
    nodelist = None

    # iterate through all links
    for n, link in enumerate(links):
        if n % 10 == 0:
            print(
                f"After {n} iterations, {len(leaf_leaders)} leaf leaders left with {len(np.unique(T))} total clusters"
            )
        first_is_leaf = link[0] in leaf_leaders
        second_is_leaf = link[1] in leaf_leaders

        if first_is_leaf and second_is_leaf:
            # link is between two leaf leaders: make new cluster of leaf leaders
            max_cluster += 1
            T[link] = max_cluster
            # remove from list of leaf_leaders
            for member in link:
                leaf_leaders.remove(member)
        elif first_is_leaf or second_is_leaf:
            # link between a leaf leader and a non-leaf-leader node; pick them
            # explicitly (the former `~bool` indexing is deprecated in Python 3.12)
            if first_is_leaf:
                leaf, node = link[0], link[1]
            else:
                leaf, node = link[1], link[0]
            if node in L:
                # other node is a leader
                downstream_leaders = [node]
            else:
                # node is not a leader, have to traverse down the tree until leaders are found
                if nodelist is None:
                    nodelist = hierarchy.to_tree(Z, rd=True)[1]
                downstream_leaders = _collect_downstream_leaders(nodelist[node], L)
            # update T: merge all downstream clusters and the leaf into a new cluster
            max_cluster += 1
            merge_clusters = M[np.isin(L, downstream_leaders)]
            T[np.isin(T, merge_clusters)] = max_cluster
            T[leaf] = max_cluster
            # remove from leaf_leaders
            leaf_leaders.remove(leaf)
        else:
            continue

        # update L,M
        L, M = hierarchy.leaders(Z, T)
        if len(leaf_leaders) == 0:
            break

    leaf_leaders = list(L[L < N])
    # no leaf clusters
    if len(leaf_leaders) == 0:
        print(
            f"All leaf leaders combined, resulting in {len(np.unique(T))} total clusters"
        )
        # relabel
        unique, inverse = np.unique(T, return_inverse=True)
        return np.arange(0, unique.shape[0])[inverse]
    else:
        raise ValueError(f"Failed to merge leaf leaders {leaf_leaders}")