コード例 #1
0
ファイル: indices.py プロジェクト: OctoberChang/pecos
def reduce_chain_len(cluster_chain, max_depth):
    """Shorten a cluster chain to ``max_depth`` levels by merging its tail.

    While the chain holds more than ``max_depth`` matrices, its last two
    entries are replaced by their product ``last.dot(second_to_last)``, so
    each pass removes exactly one level.

    Parameters
    ----------
    cluster_chain: list or cluster_util.ClusterChain
        Chain to shorten. A ``ClusterChain`` is unwrapped to its ``chain``
        attribute first.
    max_depth: int
        Number of matrices the returned chain must contain.

    Returns
    -------
    cluster_util.ClusterChain
        Chain holding exactly ``max_depth`` matrices.

    Raises
    ------
    AssertionError
        If the (unwrapped) input is not a list, or if the chain does not end
        up with exactly ``max_depth`` entries (e.g. it was already shorter).
    """
    chain = (
        cluster_chain.chain
        if isinstance(cluster_chain, cluster_util.ClusterChain)
        else cluster_chain
    )
    assert isinstance(chain, list)

    depth = len(chain)
    while depth > max_depth:
        # Fold the deepest level into the one above it.
        merged = chain[depth - 1].dot(chain[depth - 2])
        chain = chain[:depth - 2] + [merged]
        depth -= 1

    assert len(chain) == max_depth
    return cluster_util.ClusterChain(chain)
コード例 #2
0
ファイル: indices.py プロジェクト: OctoberChang/pecos
    def gen(
        cls,
        feat_mat,
        nr_splits=2,
        max_leaf_size=100,
        imbalanced_ratio=0.0,
        imbalanced_depth=100,
        spherical=True,
        seed=0,
        max_iter=20,
        threads=-1,
        dtype=sp.float32,
        mlc_mats=None,
        use_freq=True,
        **kwargs,
    ):
        """Build a cluster chain with ``hierarchical_kmeans_w_mlc``.

        Every clustering option below is forwarded unchanged to
        ``hierarchical_kmeans_w_mlc``; its result is wrapped in a
        ``cluster_util.ClusterChain``.

        Parameters
        ----------
        feat_mat:
            Feature matrix handed to the clustering routine.
        nr_splits: int
            Branching factor. Only ``2`` is supported.
        max_leaf_size, imbalanced_ratio, imbalanced_depth, spherical, seed,
        max_iter, threads, use_freq:
            Forwarded as-is to ``hierarchical_kmeans_w_mlc``.
        dtype:
            Accepted for interface compatibility; not used in this method.
        mlc_mats: list or None
            Forwarded to ``hierarchical_kmeans_w_mlc``. ``None`` (the
            default) is replaced by a fresh empty list.
        **kwargs:
            Accepted and ignored.

        Returns
        -------
        cluster_util.ClusterChain

        Raises
        ------
        NotImplementedError
            If ``nr_splits`` is not 2.
        """
        if nr_splits != 2:
            raise NotImplementedError(
                "Only nr_splits=2 is supported, got {}".format(nr_splits))

        # A fresh list per call: a literal ``[]`` default would be one shared
        # object that the clustering routine could mutate across calls.
        if mlc_mats is None:
            mlc_mats = []

        cluster_chain = hierarchical_kmeans_w_mlc(
            feat_mat=feat_mat,
            mlc_mats=mlc_mats,
            use_freq=use_freq,
            max_leaf_size=max_leaf_size,
            imbalanced_ratio=imbalanced_ratio,
            imbalanced_depth=imbalanced_depth,
            spherical=spherical,
            seed=seed,
            max_iter=max_iter,
            threads=threads,
        )
        return cluster_util.ClusterChain(cluster_chain)
コード例 #3
0
ファイル: indices.py プロジェクト: OctoberChang/pecos
def _extend_to_depth(chain, depth):
    """Extends a cluster chain to a given depth."""
    if depth < len(chain):
        return chain
    num_req = depth - len(chain)
    num_codes = chain[-1].shape[1]
    new_chain = chain[0:len(chain) - 1]
    for i in range(num_req):
        new_chain.append(smat.identity(num_codes, dtype=np.float32).tocsc())
    new_chain.append(chain[-1])
    return cluster_util.ClusterChain(new_chain)
コード例 #4
0
ファイル: indices.py プロジェクト: OctoberChang/pecos
    def gen(
        cls,
        feat_mat,
        label_strs=None,
        depth=1,
        spherical=True,
        max_iter=20,
        max_leaf_size=100,
        seed=0,
        **kwargs,
    ):
        """Build a hybrid cluster chain: a trie on top, 2-means below.

        A trie over ``label_strs`` is built down to ``depth``. The last
        matrix of the trie chain is used as a flat pre-clustering for
        ``PreClusteredHierarchicalKMeans.gen``, and the two chains are then
        spliced together.

        Parameters
        ----------
        feat_mat:
            Label feature matrix, forwarded to
            ``PreClusteredHierarchicalKMeans.gen``.
        label_strs: sequence of str or None
            Label strings in non-decreasing order. ``None`` (the default) is
            treated as an empty list.
        depth: int
            Depth to which the trie is built.
        spherical, max_iter, max_leaf_size, seed:
            Forwarded to ``PreClusteredHierarchicalKMeans.gen``.
        **kwargs:
            Accepted and ignored.

        Returns
        -------
        cluster_util.ClusterChain
            All trie levels except the last, followed by all levels of the
            k-means chain except the first.

        Raises
        ------
        Exception
            If ``label_strs`` is not sorted.
        """
        # Avoid a shared mutable default; None stands for "no labels".
        if label_strs is None:
            label_strs = []

        is_sorted = all(label_strs[i] <= label_strs[i + 1]
                        for i in range(len(label_strs) - 1))
        if not is_sorted:
            raise Exception(
                "label_strs should be sorted in order to build a cluster matrices correctly. "
                "If not sorted then rows in last matrix in cluster chain will not correspond to "
                "columns in training- data label matrix ")

        LOGGER.info("Starting Hybrid-Trie Indexing")
        trie = TrieWrapper()
        trie.update({lstr: 1 for lstr in label_strs})
        LOGGER.info(
            "Added all labels to trie. Now building trie till depth = {}".
            format(depth))

        trie_chain = trie.build_cluster_chain(depth=depth)
        flat_clust = smat.csc_matrix(
            trie_chain.chain[-1]
        )  # Use last mat in chain to define flat clustering

        LOGGER.info("Flat clust shape :{}".format(flat_clust.shape))
        remaining_chain = PreClusteredHierarchicalKMeans.gen(
            feat_mat=feat_mat,
            init_mat=flat_clust,
            hierarchical_codes=False,
            spherical=spherical,
            max_leaf_size=max_leaf_size,
            max_iter=max_iter,
            seed=seed,
        )

        # The placeholder was missing before, so str.format silently dropped
        # the shape argument and it never reached the log.
        LOGGER.info("Built remaining cluster chain using HC 2-means :{}".format(
            flat_clust.shape))
        # Drop the trie's last level (it served as the flat clustering) and
        # the first level of the k-means chain, then concatenate the rest.
        final_chain = cluster_util.ClusterChain(trie_chain.chain[:-1] +
                                                remaining_chain.chain[1:])
        return final_chain
コード例 #5
0
ファイル: indices.py プロジェクト: OctoberChang/pecos
    def build_cluster_chain(self, depth):
        """Merge per-level sub-matrices into one cluster chain.

        ``self._build_sparse_cluster_chain_helper(depth=depth)`` must return
        ``depth + 1`` levels, each a collection of matrices exposing
        ``row``, ``col`` and ``shape`` (presumably COO matrices; confirm
        against the helper). For every level the matrices are placed
        block-diagonally: each one's row and column indices are shifted by
        the cumulative row/column counts of the matrices before it. All
        merged entries are given the value 1.

        Parameters
        ----------
        depth: int
            Depth passed to the helper; the result has ``depth + 1`` levels.

        Returns
        -------
        cluster_util.ClusterChain
            One float32 CSR matrix per level.

        Raises
        ------
        AssertionError
            If the helper does not return ``depth + 1`` levels, or if the
            merged row and column index arrays differ in length.
        """
        cluster_chain = self._build_sparse_cluster_chain_helper(depth=depth)

        assert len(cluster_chain) == depth + 1
        # Merge all child cluster chains level wise
        new_chain = []
        for curr_level in range(depth + 1):
            mats_to_merge = cluster_chain[curr_level]

            # Offsets carry one extra trailing entry (the grand total); zip()
            # below stops at the shorter sequence, so it is never paired.
            row_offsets = np.concatenate(
                ([0], np.cumsum([mat.shape[0] for mat in mats_to_merge])))
            col_offsets = np.concatenate(
                ([0], np.cumsum([mat.shape[1] for mat in mats_to_merge])))
            total_n_rows = row_offsets[-1]
            total_n_cols = col_offsets[-1]

            all_rows = [
                mat.row + row_offset
                for mat, row_offset in zip(mats_to_merge, row_offsets)
            ]
            all_cols = [
                mat.col + col_offset
                for mat, col_offset in zip(mats_to_merge, col_offsets)
            ]

            new_row_idxs = np.concatenate(all_rows)
            new_col_idxs = np.concatenate(all_cols)

            assert len(new_row_idxs) == len(new_col_idxs)
            new_data = np.ones(len(new_row_idxs))

            # np.float32 replaces sp.float32, a deprecated re-export of the
            # NumPy name from SciPy's top-level namespace.
            new_mat = smat.csr_matrix(
                (new_data, (new_row_idxs, new_col_idxs)),
                shape=(total_n_rows, total_n_cols),
                dtype=np.float32,
            )
            new_chain.append(new_mat)

        return cluster_util.ClusterChain(new_chain)
コード例 #6
0
ファイル: indices.py プロジェクト: OctoberChang/pecos
    def gen(
        cls,
        feat_mat,
        init_mat,
        kdim=2,
        max_leaf_size=100,
        spherical=True,
        seed=0,
        max_iter=20,
        threads=-1,
        hierarchical_codes=True,
        **kwargs,
    ):
        """Cluster the labels of each pre-assigned code and join the results.

        For every column (code) of ``init_mat`` the labels stored in that
        column are clustered with ``cls.indexer_dict["hierarchicalkmeans"]``.
        The per-code chains are padded to a common depth, joined level by
        level with ``_block_join`` and appended below a top part that is
        either built by ``_index_clusters`` or is a single all-ones column.

        Parameters:
        ----------
        feat_mat: smat.csr_matrix
            float32 label features; rows are selected by label index.
        init_mat: smat.csc_matrix
            float32 pre-clustering; rows are labels and columns are codes.
        kdim: int
            number of children per parent node, forwarded to the sub-indexer
        max_leaf_size: int
            forwarded to the sub-indexer
        spherical: bool
            forwarded to the sub-indexer (spherical vs. regular kmeans)
        seed: int
            random seed, forwarded to the sub-indexer
        max_iter: int
            max. clustering iterations, forwarded to the sub-indexer
        threads: int
            number of cores, forwarded to the sub-indexer
        hierarchical_codes: bool
            true: top of the chain comes from ``_index_clusters``
            false: top of the chain is one all-ones column over the codes
        **kwargs:
            accepted and ignored

        Returns:
        -------
        cluster_util.ClusterChain representing the final clustering.

        Raises:
        ------
        ValueError if ``feat_mat`` / ``init_mat`` have the wrong sparse
        format or a dtype other than float32.

        """
        if not isinstance(feat_mat, smat.csr_matrix):
            raise ValueError("feat_mat does not follow correct input format")
        if feat_mat.dtype != np.float32:
            raise ValueError("feat_mat does not follow correct data type")
        if not isinstance(init_mat, smat.csc_matrix):
            raise ValueError("init_mat does not follow correct input format")
        if init_mat.dtype != np.float32:
            raise ValueError("init_mat does not follow correct data type")

        n_codes = init_mat.shape[1]
        visited_labels = []
        per_code_chains = []
        for code in range(n_codes):
            LOGGER.info(
                "Training hierarchical clustering for code: {}".format(code))
            # Row indices stored in this CSC column are the code's labels.
            col_start = init_mat.indptr[code]
            col_stop = init_mat.indptr[code + 1]
            members = init_mat.indices[col_start:col_stop]
            sub_chain = cls.indexer_dict["hierarchicalkmeans"].gen(
                feat_mat=feat_mat[members, :],
                kdim=kdim,
                max_leaf_size=max_leaf_size,
                # Non-zero but tiny ratio: avoids the error PECOS throws
                # when a branch contains just a single label.
                imbalanced_ratio=1e-19,
                spherical=spherical,
                seed=seed,
                threads=threads,
                max_iter=max_iter,
            )
            per_code_chains.append(sub_chain)
            visited_labels.extend(list(members))

        if hierarchical_codes:
            joined = _index_clusters(feat_mat, init_mat)
        else:
            root = smat.csc_matrix(np.ones((n_codes, 1)), dtype=np.float32)
            joined = [root]

        # Pad every per-code chain to the deepest one so levels line up.
        common_depth = max(map(len, per_code_chains))
        per_code_chains = [
            _extend_to_depth(sub_chain, common_depth)
            for sub_chain in per_code_chains
        ]
        for level in range(common_depth):
            LOGGER.info("Joining matrices at depth {}".format(level))
            joined.append(
                _block_join([sub_chain[level] for sub_chain in per_code_chains]))

        # Rows of the last matrix follow the order labels were visited in;
        # permute them back so row i corresponds to label i.
        position_of = [0] * len(visited_labels)
        for position, label in enumerate(visited_labels):
            position_of[label] = position
        leaf_level = joined[-1].tocsr()
        joined[-1] = leaf_level[position_of, :].tocsc()
        return cluster_util.ClusterChain(joined)