def reduce_chain_len(cluster_chain, max_depth):
    """Collapse the deepest levels of a cluster chain down to ``max_depth`` levels.

    The trailing matrices of the chain are multiplied together (last matrix
    times the one before it, and so on) until only ``max_depth`` matrices
    remain. The leading ``max_depth - 1`` matrices are kept untouched.

    Parameters
    ----------
    cluster_chain: list or cluster_util.ClusterChain
        Cluster chain, either as a ``ClusterChain`` (its ``.chain`` list is
        used) or as a plain list of matrices supporting ``.dot``.
    max_depth: int
        Number of levels of the returned chain. Must not exceed the current
        number of levels, and must be at least 1 when levels are collapsed.

    Returns
    -------
    cluster_util.ClusterChain
        Cluster chain with exactly ``max_depth`` levels.

    Raises
    ------
    TypeError
        If ``cluster_chain`` is neither a ``ClusterChain`` nor a list.
    ValueError
        If ``max_depth`` cannot be reached by collapsing levels.
    """
    if isinstance(cluster_chain, cluster_util.ClusterChain):
        cluster_chain = cluster_chain.chain
    # Explicit checks instead of ``assert`` so validation survives ``python -O``.
    if not isinstance(cluster_chain, list):
        raise TypeError(
            "cluster_chain must be a list or ClusterChain, got {}".format(
                type(cluster_chain).__name__))

    n_levels = len(cluster_chain)
    if max_depth == n_levels:
        # Nothing to collapse; just wrap the chain as-is.
        return cluster_util.ClusterChain(cluster_chain)
    if not 1 <= max_depth < n_levels:
        # A depth below 1 would make the merge index wrap around to the end of
        # the list, and a depth above n_levels cannot be produced by merging.
        raise ValueError(
            "max_depth must be between 1 and {} (current chain length), "
            "got {}".format(n_levels, max_depth))

    # Fold the tail bottom-up: (((last . prev) . prev2) ...) until the merged
    # matrix can take the place of level ``max_depth``.
    merged = cluster_chain[-1]
    for upper_mat in reversed(cluster_chain[max_depth - 1:-1]):
        merged = merged.dot(upper_mat)

    return cluster_util.ClusterChain(cluster_chain[:max_depth - 1] + [merged])
def gen(
    cls,
    feat_mat,
    nr_splits=2,
    max_leaf_size=100,
    imbalanced_ratio=0.0,
    imbalanced_depth=100,
    spherical=True,
    seed=0,
    max_iter=20,
    threads=-1,
    dtype=sp.float32,
    mlc_mats=None,
    use_freq=True,
    **kwargs,
):
    """Build a cluster chain by delegating to ``hierarchical_kmeans_w_mlc``.

    Parameters
    ----------
    feat_mat:
        Feature matrix forwarded to ``hierarchical_kmeans_w_mlc``.
    nr_splits: int
        Branching factor. Only ``2`` is supported.
    max_leaf_size, imbalanced_ratio, imbalanced_depth, spherical, seed,
    max_iter, threads:
        Forwarded unchanged to ``hierarchical_kmeans_w_mlc``.
    dtype:
        Accepted for interface compatibility; not used by this method.
    mlc_mats: list, optional
        Forwarded to ``hierarchical_kmeans_w_mlc``. ``None`` (the default) is
        treated as an empty list.
    use_freq: bool
        Forwarded unchanged to ``hierarchical_kmeans_w_mlc``.
    **kwargs:
        Accepted and ignored.

    Returns
    -------
    cluster_util.ClusterChain
        The result of ``hierarchical_kmeans_w_mlc`` wrapped in a ClusterChain.

    Raises
    ------
    NotImplementedError
        If ``nr_splits`` is not 2.
    """
    if nr_splits != 2:
        raise NotImplementedError(
            "Only nr_splits=2 is supported, got {}".format(nr_splits))

    # A fresh list per call avoids sharing one mutable default across calls.
    if mlc_mats is None:
        mlc_mats = []

    cluster_chain = hierarchical_kmeans_w_mlc(
        feat_mat=feat_mat,
        mlc_mats=mlc_mats,
        use_freq=use_freq,
        max_leaf_size=max_leaf_size,
        imbalanced_ratio=imbalanced_ratio,
        imbalanced_depth=imbalanced_depth,
        spherical=spherical,
        seed=seed,
        max_iter=max_iter,
        threads=threads,
    )
    return cluster_util.ClusterChain(cluster_chain)
def _extend_to_depth(chain, depth):
    """Pad a cluster chain with identity levels so that it has ``depth`` levels.

    When ``depth`` is smaller than the current length the chain is returned
    unchanged (same object). Otherwise identity matrices, sized by the column
    count of the last matrix, are inserted just before the last matrix and the
    result is wrapped in a ``cluster_util.ClusterChain``.
    """
    current_depth = len(chain)
    if depth < current_depth:
        return chain

    leaf_mat = chain[-1]
    code_count = leaf_mat.shape[1]

    # One independent identity matrix per missing level.
    padding = [
        smat.identity(code_count, dtype=np.float32).tocsc()
        for _ in range(depth - current_depth)
    ]

    extended = chain[0:current_depth - 1]
    extended.extend(padding)
    extended.append(leaf_mat)
    return cluster_util.ClusterChain(extended)
def gen(
    cls,
    feat_mat,
    label_strs=None,
    depth=1,
    spherical=True,
    max_iter=20,
    max_leaf_size=100,
    seed=0,
    **kwargs,
):
    """Build a hybrid cluster chain: a label trie on top, 2-means underneath.

    A trie over ``label_strs`` is built up to ``depth`` and converted into a
    cluster chain. The last matrix of that chain is used as a flat
    pre-clustering, which ``PreClusteredHierarchicalKMeans.gen`` then refines.
    The returned chain is the trie chain without its last matrix, followed by
    the refined chain without its first matrix.

    Parameters
    ----------
    feat_mat:
        Label feature matrix, forwarded to ``PreClusteredHierarchicalKMeans.gen``.
    label_strs: sequence of str, optional
        Label strings in sorted (non-decreasing) order. ``None`` (the default)
        is treated as an empty list.
    depth: int
        Depth up to which the trie is built.
    spherical, max_iter, max_leaf_size, seed:
        Forwarded to ``PreClusteredHierarchicalKMeans.gen``.
    **kwargs:
        Accepted and ignored.

    Returns
    -------
    cluster_util.ClusterChain
        The combined trie + k-means cluster chain.

    Raises
    ------
    ValueError
        If ``label_strs`` is not sorted.
    """
    # A fresh list per call avoids sharing one mutable default across calls.
    if label_strs is None:
        label_strs = []

    is_sorted = all(
        prev <= nxt for prev, nxt in zip(label_strs, label_strs[1:]))
    if not is_sorted:
        raise ValueError(
            "label_strs should be sorted in order to build a cluster matrices correctly. "
            "If not sorted then rows in last matrix in cluster chain will not correspond to "
            "columns in training- data label matrix ")

    LOGGER.info("Starting Hybrid-Trie Indexing")
    trie = TrieWrapper()
    trie.update({lstr: 1 for lstr in label_strs})
    LOGGER.info(
        "Added all labels to trie. Now building trie till depth = {}".format(
            depth))
    trie_chain = trie.build_cluster_chain(depth=depth)

    # Use last mat in chain to define flat clustering
    flat_clust = smat.csc_matrix(trie_chain.chain[-1])
    LOGGER.info("Flat clust shape :{}".format(flat_clust.shape))

    remaining_chain = PreClusteredHierarchicalKMeans.gen(
        feat_mat=feat_mat,
        init_mat=flat_clust,
        hierarchical_codes=False,
        spherical=spherical,
        max_leaf_size=max_leaf_size,
        max_iter=max_iter,
        seed=seed,
    )
    # The original message had no placeholder, so its format argument was
    # silently dropped; report the size of the chain that was just built.
    LOGGER.info(
        "Built remaining cluster chain using HC 2-means : {} levels".format(
            len(remaining_chain.chain)))

    # The flat clustering (last trie matrix) is superseded by the refined
    # chain, whose own first matrix is skipped when the two are stitched.
    return cluster_util.ClusterChain(
        trie_chain.chain[:-1] + remaining_chain.chain[1:])
def build_cluster_chain(self, depth):
    """Merge per-level groups of sub-matrices into one cluster chain.

    ``_build_sparse_cluster_chain_helper`` is expected to return ``depth + 1``
    groups of matrices, one group per level. Within a level the matrices are
    laid out block-diagonally: each block's row/column indices are shifted by
    the total rows/columns of the blocks before it. Every stored entry of the
    merged matrix is set to 1 (as float32); the original values are not used.
    Each block must expose ``row``, ``col`` and ``shape`` (as scipy COO
    matrices do).

    Parameters
    ----------
    depth: int
        Depth passed to the helper; the resulting chain has ``depth + 1`` levels.

    Returns
    -------
    cluster_util.ClusterChain
        Chain of the merged CSR matrices, one per level.
    """
    level_groups = self._build_sparse_cluster_chain_helper(depth=depth)
    assert len(level_groups) == depth + 1

    # Merge all child cluster chains level wise
    merged_levels = []
    for level in range(depth + 1):
        blocks = level_groups[level]

        # Start offset of every block inside the merged matrix; the trailing
        # entry of each array is the merged matrix's total size on that axis.
        row_starts = np.concatenate(
            ([0], np.cumsum([blk.shape[0] for blk in blocks])))
        col_starts = np.concatenate(
            ([0], np.cumsum([blk.shape[1] for blk in blocks])))

        # zip stops at the shorter input, so the trailing total is never
        # paired with a block.
        shifted_rows = [blk.row + start for blk, start in zip(blocks, row_starts)]
        shifted_cols = [blk.col + start for blk, start in zip(blocks, col_starts)]

        rows = np.concatenate(shifted_rows)
        cols = np.concatenate(shifted_cols)
        assert len(rows) == len(cols)

        merged = smat.csr_matrix(
            (np.ones(len(rows)), (rows, cols)),
            shape=(row_starts[-1], col_starts[-1]),
            dtype=sp.float32,
        )
        merged_levels.append(merged)

    return cluster_util.ClusterChain(merged_levels)
def gen(
    cls,
    feat_mat,
    init_mat,
    kdim=2,
    max_leaf_size=100,
    spherical=True,
    seed=0,
    max_iter=20,
    threads=-1,
    hierarchical_codes=True,
    **kwargs,
):
    """Cluster labels hierarchically inside each pre-defined code.

    Every column (code) of ``init_mat`` is clustered on its own with the
    ``"hierarchicalkmeans"`` indexer. The per-code chains are padded to a
    common depth and joined level by level below a top part that is either
    built by ``_index_clusters`` or a single root matrix.

    Parameters:
    ----------
    feat_mat: smat.csr_matrix (float32)
        label features to be used for clustering.
    init_mat: smat.csc_matrix (float32)
        initial pre-clustering as sparse matrix, rows are labels
        and columns are codes.
    kdim: int
        number of children for each parent node
    max_leaf_size: int
        forwarded to the per-code hierarchical k-means indexer
    spherical: bool
        true: use spherical kmeans
        false: use regular kmeans
    seed: int
        random seed
    max_iter: int
        max. iterations for clustering
    threads: int
        number of cores to be used
    hierarchical_codes: bool
        true: make a hierarchical tree from roots to codes
        false: root branches into codes directly
    **kwargs:
        accepted and ignored

    Returns:
    -------
    cluster_chain representing the final clustering.

    Raises:
    ------
    ValueError
        if ``feat_mat`` or ``init_mat`` has the wrong sparse format or dtype.
    """
    # Checked in order: feat_mat format, feat_mat dtype, init_mat format,
    # init_mat dtype.
    input_checks = (
        (
            feat_mat,
            smat.csr_matrix,
            "feat_mat does not follow correct input format",
            "feat_mat does not follow correct data type",
        ),
        (
            init_mat,
            smat.csc_matrix,
            "init_mat does not follow correct input format",
            "init_mat does not follow correct data type",
        ),
    )
    for matrix, required_type, format_msg, dtype_msg in input_checks:
        if not isinstance(matrix, required_type):
            raise ValueError(format_msg)
        if matrix.dtype != np.float32:
            raise ValueError(dtype_msg)

    per_code_chains = []
    visited_labels = []  # label ids in the order the codes list them
    col_ptr = init_mat.indptr
    for code in range(init_mat.shape[1]):
        LOGGER.info(
            "Training hierarchical clustering for code: {}".format(code))
        # Row indices stored in this CSC column, i.e. the labels of this code.
        members = init_mat.indices[col_ptr[code]:col_ptr[code + 1]]
        sub_chain = cls.indexer_dict["hierarchicalkmeans"].gen(
            feat_mat=feat_mat[members, :],
            kdim=kdim,
            max_leaf_size=max_leaf_size,
            # Passing non-zero but very very small imbalanced ratio to avoid
            # error thrown by PECOS package when a branch has just a single
            # label.
            imbalanced_ratio=1e-19,
            spherical=spherical,
            seed=seed,
            threads=threads,
            max_iter=max_iter,
        )
        per_code_chains.append(sub_chain)
        visited_labels.extend(members)

    # Top of the tree: either a hierarchy over the codes, or a single root
    # whose children are the codes.
    if hierarchical_codes:
        joined_chain = _index_clusters(feat_mat, init_mat)
    else:
        root_mat = smat.csc_matrix(
            np.ones((init_mat.shape[1], 1)),
            dtype=np.float32,
        )
        joined_chain = [root_mat]

    # Bring every per-code chain to the same depth so levels can be joined.
    common_depth = max(len(sub_chain) for sub_chain in per_code_chains)
    per_code_chains = [
        _extend_to_depth(sub_chain, common_depth)
        for sub_chain in per_code_chains
    ]
    for d in range(common_depth):
        LOGGER.info("Joining matrices at depth {}".format(d))
        joined_chain.append(
            _block_join([sub_chain[d] for sub_chain in per_code_chains]))

    # Invert the visiting order: row_of_label[label] is the position at which
    # that label was visited. NOTE(review): presumably rows of the last joined
    # matrix follow the visiting order, so this restores original label ids
    # - confirm against _block_join.
    row_of_label = [0] * len(visited_labels)
    for position, label in enumerate(visited_labels):
        row_of_label[label] = position
    leaf_rows = joined_chain[-1].tocsr()
    joined_chain[-1] = leaf_rows[row_of_label, :].tocsc()

    return cluster_util.ClusterChain(joined_chain)