Example 1
    def kmeans(feat_mat,
               c1=-1,
               c2=-1,
               min_size=50,
               kmeans_max_iter=20,
               spherical=True):
        if c1 == -1:
            c1, c2 = np.random.randint(feat_mat.shape[0]), np.random.randint(
                1, feat_mat.shape[0])
        c1, c2 = feat_mat[c1], feat_mat[(c1 + c2) % feat_mat.shape[0]]
        old_indexer = np.ones(feat_mat.shape[0]) * -1

        for _ in range(kmeans_max_iter):
            scores = np.squeeze(np.asarray(feat_mat.multiply(c1 - c2).sum(1)))
            indexer = scores >= 0
            if indexer.sum() < min_size:
                indexer = np.zeros(feat_mat.shape[0], dtype=bool)
                indexer[np.argpartition(-scores, min_size)[:min_size]] = True
            elif (~indexer).sum() < min_size:
                indexer = np.zeros(feat_mat.shape[0], dtype=bool)
                indexer[np.argpartition(scores, min_size)[min_size:]] = True
            if np.array_equal(indexer, old_indexer):
                break
            old_indexer = indexer
            c1 = feat_mat[indexer].sum(0)
            c2 = feat_mat[~indexer].sum(0)
            if spherical:
                c1 = sk_normalize(c1)
                c2 = sk_normalize(c2)
        return indexer
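
A minimal driver for the snippet above, shown only as a hedged sketch: it assumes kmeans is accessible as a plain function, that sk_normalize is sklearn.preprocessing.normalize, and it uses made-up random data. The function returns a boolean mask splitting the rows into two child clusters of at least min_size rows each.

import numpy as np
import scipy.sparse as smat
from sklearn.preprocessing import normalize as sk_normalize

# Synthetic, row-normalized sparse features (hypothetical: 100 rows x 8 columns)
feat_mat = smat.csr_matrix(sk_normalize(np.random.rand(100, 8)))
split = kmeans(feat_mat)              # boolean mask: True rows form one cluster
print(split.sum(), (~split).sum())    # both counts are at least min_size=50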
Example 2
def test_normalize(
        failure_logger,
        clf_dataset,
        axis,
        norm,  # noqa: F811
        return_norm):
    X_np, X = clf_dataset

    if return_norm:
        t_X, t_norms = cu_normalize(X,
                                    axis=axis,
                                    norm=norm,
                                    return_norm=return_norm)
        sk_t_X, sk_t_norms = sk_normalize(X_np,
                                          axis=axis,
                                          norm=norm,
                                          return_norm=return_norm)
        assert_allclose(t_norms, sk_t_norms)
    else:
        t_X = cu_normalize(X, axis=axis, norm=norm, return_norm=return_norm)
        sk_t_X = sk_normalize(X_np,
                              axis=axis,
                              norm=norm,
                              return_norm=return_norm)

    assert type(t_X) == type(X)
    assert_allclose(t_X, sk_t_X)
Example 3
    def testNormalizeExecution(self):
        raw_dense = np.random.rand(10, 10)
        raw_sparse = sps.random(10, 10, density=0.4, format='csr')

        for chunk_size in [10, 6, (10, 6), (6, 10)]:
            for raw, x in [
                (raw_dense, mt.tensor(raw_dense, chunk_size=chunk_size)),
                (raw_sparse, mt.tensor(raw_sparse, chunk_size=chunk_size))
            ]:
                for norm in ['l1', 'l2', 'max']:
                    for axis in (0, 1):
                        for use_sklearn in [True, False]:
                            n = normalize(x,
                                          norm=norm,
                                          axis=axis,
                                          return_norm=False)
                            n.op._use_sklearn = use_sklearn

                            result = self.executor.execute_tensor(
                                n, concat=True)[0]
                            expected = sk_normalize(raw,
                                                    norm=norm,
                                                    axis=axis,
                                                    return_norm=False)

                            if sps.issparse(expected):
                                expected = expected.A
                            np.testing.assert_almost_equal(
                                np.asarray(result), expected)

        raw_dense = np.random.rand(10, 10)
        raw_sparse = sps.random(10, 10, density=0.4, format='csr')

        # test copy and return_normalize
        for axis in (0, 1):
            for chunk_size in (10, 6, (6, 10)):
                for raw in (raw_dense, raw_sparse):
                    x = mt.tensor(raw, chunk_size=chunk_size)
                    n = normalize(x, axis=axis, copy=False, return_norm=True)

                    results = self.executor.execute_tensors(n)
                    raw_copy = raw.copy()
                    try:
                        expects = sk_normalize(raw_copy,
                                               axis=axis,
                                               copy=False,
                                               return_norm=True)
                    except NotImplementedError:
                        continue

                    if sps.issparse(expects[0]):
                        expected = expects[0].A
                    else:
                        expected = expects[0]
                    np.testing.assert_almost_equal(np.asarray(results[0]),
                                                   expected)
                    np.testing.assert_almost_equal(results[1], expects[1])
Example 4
def test_normalize_execution(setup):
    raw_dense = np.random.rand(10, 10)
    raw_sparse = sps.random(10, 10, density=0.4, format="csr")

    for chunk_size in [10, 6, (10, 6), (6, 10)]:
        for raw, x in [
            (raw_dense, mt.tensor(raw_dense, chunk_size=chunk_size)),
            (raw_sparse, mt.tensor(raw_sparse, chunk_size=chunk_size)),
        ]:
            for norm in ["l1", "l2", "max"]:
                for axis in (0, 1):
                    for use_sklearn in [True, False]:
                        n = normalize(x,
                                      norm=norm,
                                      axis=axis,
                                      return_norm=False)
                        n.op._use_sklearn = use_sklearn

                        result = n.execute().fetch()
                        expected = sk_normalize(raw,
                                                norm=norm,
                                                axis=axis,
                                                return_norm=False)

                        if sps.issparse(expected):
                            expected = expected.A
                        np.testing.assert_almost_equal(np.asarray(result),
                                                       expected)

    raw_dense = np.random.rand(10, 10)
    raw_sparse = sps.random(10, 10, density=0.4, format="csr")

    # test copy and return_normalize
    for axis in (0, 1):
        for chunk_size in (10, 6, (6, 10)):
            for raw in (raw_dense, raw_sparse):
                x = mt.tensor(raw, chunk_size=chunk_size)
                n = normalize(x, axis=axis, copy=False, return_norm=True)

                results = n.execute().fetch()
                raw_copy = raw.copy()
                try:
                    expects = sk_normalize(raw_copy,
                                           axis=axis,
                                           copy=False,
                                           return_norm=True)
                except NotImplementedError:
                    continue

                if sps.issparse(expects[0]):
                    expected = expects[0].A
                else:
                    expected = expects[0]
                np.testing.assert_almost_equal(np.asarray(results[0]),
                                               expected)
                np.testing.assert_almost_equal(results[1], expects[1])
Example 5
def load_feature_matrix(args):
    if args.feature_format % 3 == 0:
        X1 = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat1)
        X2 = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat2)
        X = smat.hstack([sk_normalize(X1, axis=1),
                         sk_normalize(X2, axis=1)]).tocsr()
    elif args.feature_format % 3 == 1 and args.input_inst_feat1:
        X = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat1)
    elif args.feature_format % 3 == 2 and args.input_inst_feat2:
        X = HierarchicalMLModel.load_feature_matrix(args.input_inst_feat2)
    else:
        raise NotImplementedError(
            f"args.feature_format = {args.feature_format} is not supported.")
    if args.feature_format // 3 == 0:
        X = sk_normalize(X, axis=1, copy=False)
    return X
Example 6
    def __init__(self, code_to_label):
        assert isinstance(code_to_label, smat.spmatrix)
        code_to_label = code_to_label.tocsr()
        self.code_to_label = sk_normalize(code_to_label,
                                          axis=1,
                                          copy=False,
                                          norm='l1')
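
For reference, a self-contained sketch of the L1 row normalization applied above, with made-up values and assuming sk_normalize is sklearn.preprocessing.normalize: after normalizing with norm='l1' along axis=1, every non-empty row of the CSR matrix sums to 1.

import numpy as np
import scipy.sparse as smat
from sklearn.preprocessing import normalize as sk_normalize

# Toy code-to-label matrix (hypothetical values, not from the project above)
code_to_label = smat.csr_matrix(np.array([[2.0, 2.0], [0.0, 3.0]]))
normalized = sk_normalize(code_to_label, axis=1, copy=False, norm='l1')
print(normalized.toarray())  # [[0.5, 0.5], [0.0, 1.0]]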
Example 7
    def predict_new(self,
                    X,
                    only_topk=None,
                    csr_codes=None,
                    beam_size=2,
                    max_depth=None,
                    cond_prob=True,
                    normalized=False,
                    threads=-1):
        if max_depth is None:
            max_depth = self.depth
        if cond_prob is None or cond_prob == False:
            cond_prob = PostProcessor(Transform.identity, Combiner.noop)
        if cond_prob == True:
            cond_prob = PostProcessor(Transform.get_lpsvm(3), Combiner.mul)
        assert isinstance(cond_prob, PostProcessor), type(cond_prob)

        assert X.shape[1] == self.nr_features
        if self.bias > 0:
            X = smat_util.append_column(X, self.bias)
        pX = PyMatrix.init_from(X, dtype=self.model_chain[0].pW.dtype)
        max_depth = min(self.depth, max_depth)
        pred_csr = csr_codes
        for d in range(max_depth):
            cur_model = self.model_chain[d]
            local_only_topk = only_topk if d == (max_depth - 1) else beam_size
            pred_csr = cur_model.predict_new(pX,
                                             only_topk=local_only_topk,
                                             csr_codes=pred_csr,
                                             cond_prob=cond_prob,
                                             threads=threads)
        if normalized:
            pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm='l1')
        return pred_csr
Example 8
    def predict(
        self,
        X,
        csr_codes=None,
        only_topk=None,
        cond_prob=True,
        normalize=False,
        **arg_kw,
    ):
        assert csr_codes is not None, "csr_codes must be provided for CountModel.predict()"
        assert csr_codes.shape[0] == X.shape[0]
        assert csr_codes.shape[1] == self.nr_codes
        if cond_prob:
            pred_csr = csr_codes.dot(self.code_to_label).tocsr()
        else:
            tmp = csr_codes.data
            tmp2 = sp.ones_like(tmp)
            csr_codes.data = tmp2
            pred_csr = csr_codes.dot(self.code_to_label).tocsr()
            csr_codes.data = tmp

        pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)
        if normalize:
            pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1")
        return pred_csr
Example 9
def test_inplace_csr_row_normalize_l2(failure_logger, sparse_random_dataset):
    X_np, _, _, X_sparse = sparse_random_dataset
    if X_sparse.format != 'csr':
        pytest.skip('Skip non CSR matrices')

    inplace_csr_row_normalize_l2(X_sparse)
    X_np = sk_normalize(X_np, norm='l2', axis=1)
    assert_allclose(X_sparse, X_np)
Example 10
def orthogonalize_gram_schmidt(kmat):
    """Orthogonalize filters using gram_schmidt method. kmat should be num_params x num_filts"""

    num_par, num_filts = kmat.shape
    # First normalize all filters
    kmat_out = sk_normalize(kmat, axis=0)

    for nn in range(num_filts - 1):

        # orthogonalize all remaining filters against the chosen one
        for mm in range(nn + 1, num_filts):
            kmat_out[:, mm] = kmat_out[:, mm] - np.dot(
                kmat_out[:, nn], kmat_out[:, mm]) * kmat_out[:, nn]

        # renormalize
        kmat_out = sk_normalize(kmat_out, axis=0)

    return kmat_out
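
A quick sanity check of the routine above on synthetic data (the random matrix and tolerance are illustrative assumptions, and sk_normalize is assumed to be sklearn.preprocessing.normalize): after Gram-Schmidt the columns should be near-orthonormal, so their Gram matrix should be close to the identity.

import numpy as np
from sklearn.preprocessing import normalize as sk_normalize

rng = np.random.default_rng(0)
kmat = rng.standard_normal((20, 4))       # num_params x num_filts, synthetic
kmat_out = orthogonalize_gram_schmidt(kmat)
gram = kmat_out.T @ kmat_out              # ~ identity for a full-rank input
print(np.allclose(gram, np.eye(4), atol=1e-6))  # True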
Example 11
    def predict(
        self,
        X,
        only_topk=None,
        csr_codes=None,
        cond_prob=None,
        normalized=False,
        threads=-1,
    ):
        assert X.shape[1] == self.nr_features
        if csr_codes is None:
            dense = X.dot(self.W).toarray()
            if cond_prob:
                dense = cond_prob.transform(dense, inplace=True)
            coo = smat_util.dense_to_coo(dense)
            pred_csr = smat_util.sorted_csr_from_coo(coo.shape,
                                                     coo.row,
                                                     coo.col,
                                                     coo.data,
                                                     only_topk=only_topk)
        else:  # csr_codes is given
            assert self.C is not None, "This model does not have C"
            assert X.shape[1] == self.nr_features
            assert csr_codes.shape[0] == X.shape[0]
            assert csr_codes.shape[1] == self.nr_codes
            if (csr_codes.data == 0).sum() != 0:
                # this is a trick to avoid zero entries explicit removal from the smat_dot_smat
                offset = sp.absolute(csr_codes.data).max() + 1
                csr_codes = smat.csr_matrix(
                    (csr_codes.data + offset, csr_codes.indices,
                     csr_codes.indptr),
                    shape=csr_codes.shape,
                )
                csr_labels = (csr_codes.dot(self.C.T)).tocsr()
                csr_labels.data -= offset
            else:
                csr_labels = (csr_codes.dot(self.C.T)).tocsr()
            nnz_of_insts = csr_labels.indptr[1:] - csr_labels.indptr[:-1]
            inst_idx = sp.repeat(sp.arange(X.shape[0], dtype=sp.uint32),
                                 nnz_of_insts)
            label_idx = csr_labels.indices.astype(sp.uint32)
            val = self.predict_values(X, inst_idx, label_idx, threads=threads)
            if cond_prob:
                val = cond_prob.transform(val, inplace=True)
                val = cond_prob.combiner(val, csr_labels.data)

            pred_csr = smat_util.sorted_csr_from_coo(csr_labels.shape,
                                                     inst_idx,
                                                     label_idx,
                                                     val,
                                                     only_topk=only_topk)

        if normalized:
            pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1")
        return pred_csr
Example 12
def test_normalize_sparse(sparse_clf_dataset, norm):  # noqa: F811
    X_np, X = sparse_clf_dataset

    axis = 0 if X.format == 'csc' else 1

    t_X = cu_normalize(X, axis=axis, norm=norm)
    assert type(t_X) == type(X)

    sk_t_X = sk_normalize(X_np, axis=axis, norm=norm)

    assert_allclose(t_X, sk_t_X)
Example 13
def normalize(data):
    """
    Normalize the data to the [-1, 1] range.

    Arguments:
        data (np.array): Data to normalize.

    Todo:
        Revise the outputs.
    """

    return sk_normalize(data, norm='max', axis=0, return_norm=True)
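
A small illustrative call with made-up data (assuming sk_normalize is sklearn.preprocessing.normalize): because the wrapper passes return_norm=True, it returns a (scaled, norms) tuple rather than just the scaled array, which is what the Todo above hints at.

import numpy as np
from sklearn.preprocessing import normalize as sk_normalize

data = np.array([[1.0, -4.0],
                 [2.0,  2.0]])
scaled, norms = normalize(data)   # the wrapper defined above
print(scaled)   # each column divided by its max absolute value -> values in [-1, 1]
print(norms)    # per-column max absolute values: [2. 4.]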
Example 14
    def predict_new(
        self,
        X,
        only_topk=None,
        csr_codes=None,
        cond_prob=None,
        normalized=False,
        threads=-1,
    ):
        assert X.shape[1] == self.nr_features
        if csr_codes is None:
            dense = X.dot(self.W).toarray()
            if cond_prob:
                dense = cond_prob.transform(dense, inplace=True)
            coo = smat_util.dense_to_coo(dense)
            pred_csr = smat_util.sorted_csr_from_coo(coo.shape,
                                                     coo.row,
                                                     coo.col,
                                                     coo.data,
                                                     only_topk=only_topk)
        else:  # csr_codes is given
            assert self.C is not None, "This model does not have C"
            assert X.shape[1] == self.nr_features
            assert csr_codes.shape[0] == X.shape[0]
            assert csr_codes.shape[1] == self.nr_codes
            if not csr_codes.has_sorted_indices:
                csr_codes = csr_codes.sorted_indices()
            if (csr_codes.data == 0).sum() != 0:
                # this is a trick to avoid zero entries explicit removal from the smat_dot_smat
                offset = sp.absolute(csr_codes.data).max() + 1
                csr_codes = smat.csr_matrix(
                    (csr_codes.data + offset, csr_codes.indices,
                     csr_codes.indptr),
                    shape=csr_codes.shape,
                )
                pZ = PyMatrix.init_from(csr_codes, self.dtype)
                csr_labels, pred_csr = clib.multilabel_predict_with_codes(
                    X, self.pW, self.pC, pZ, threads=threads)
                csr_labels.data -= offset
            else:
                pZ = PyMatrix.init_from(csr_codes.sorted_indices(), self.dtype)
                csr_labels, pred_csr = clib.multilabel_predict_with_codes(
                    X, self.pW, self.pC, pZ, threads=threads)
            val = pred_csr.data
            if cond_prob:
                val = cond_prob.transform(val, inplace=True)
                val = cond_prob.combiner(val, csr_labels.data)

            pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)

        if normalized:
            pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm="l1")
        return pred_csr
Example 15
def normalize(X, norm='l2', copy=False):
    """Normalize sparse or dense matrix
    Arguments:
    ---------
    X: csr_matrix or csc_matrix
        sparse matrix
    norm: str, optional, default='l2'
        normalize with l1/l2
    copy: boolean, optional, default=False
        whether to copy data or not
    """
    return sk_normalize(X, norm=norm, copy=copy)
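
For illustration, a minimal call to the wrapper above with a toy CSR matrix (assuming sk_normalize is sklearn.preprocessing.normalize): with the defaults, each row is scaled to unit L2 norm.

import numpy as np
import scipy.sparse as smat
from sklearn.preprocessing import normalize as sk_normalize

X = smat.csr_matrix(np.array([[3.0, 4.0],
                              [0.0, 5.0]]))
X_norm = normalize(X)     # wrapper above; defaults to norm='l2', copy=False
print(X_norm.toarray())   # [[0.6, 0.8], [0.0, 1.0]]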
Example 16
def test_inplace_csr_row_normalize_l2(sparse_clf_dataset):  # noqa: F811
    X_np, X = sparse_clf_dataset

    if not cp.sparse.issparse(X):
        pytest.skip("Skipping non-CuPy or non-sparse arrays")

    if X.format != 'csr':
        X = X.tocsr()

    inplace_csr_row_normalize_l2(X)

    X_np = X_np.toarray()
    X_np = sk_normalize(X_np, norm='l2', axis=1)

    assert_allclose(X, X_np)
Example 17
def main(args):
    # set hyper-parameters
    input_feat_path = args.input_feat_path
    depth = args.depth
    kdim = args.kdim
    algo = args.algo
    seed = args.seed
    verbose = args.verbose
    max_iter = args.max_iter
    threads = args.threads
    output_code_dir = args.output_code_dir
    if verbose:
        print("depth {} kdim {} algo {}".format(depth, kdim, algo))

    # load label feature matrix (nr_labels * nr_features)
    if path.exists(input_feat_path):
        feat_mat = load_feature_matrix(input_feat_path)
    else:
        raise ValueError(
            "label embedding path does not exist {}".format(input_feat_path))

    if not path.exists(output_code_dir):
        os.makedirs(output_code_dir, exist_ok=True)

    # Indexing algorithm
    # C: nr_labels x nr_codes, stored in csr sparse matrix
    indexer = Indexer(feat_mat)
    if algo == indexer.SKMEANS:
        feat_mat = sk_normalize(feat_mat, axis=1, norm="l2", copy=False)

    code = indexer.gen(kdim=kdim,
                       depth=depth,
                       algo=algo,
                       seed=seed,
                       max_iter=max_iter,
                       threads=threads)
    if verbose:
        code.print()
    C = code.get_csc_matrix()
    if verbose:
        print("C", C.shape)

    # save code and args
    output_code_path = path.join(output_code_dir, "code.npz")
    smat.save_npz("{}".format(output_code_path), C, compressed=False)
    output_config_path = path.join(output_code_dir, "config.json")
    with open(output_config_path, "w") as fout:
        fout.write(json.dumps(vars(args), indent=True))
Example 18
def test_normalize_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
                          norm):
    X_np, X = sparse_clf_dataset

    axis = 0 if X.format == 'csc' else 1

    t_X = cu_normalize(X, axis=axis, norm=norm)
    #  assert type(t_X) == type(X)
    if cpx.scipy.sparse.issparse(X):
        assert cpx.scipy.sparse.issparse(t_X)
    if scipy.sparse.issparse(X):
        assert scipy.sparse.issparse(t_X)

    sk_t_X = sk_normalize(X_np, axis=axis, norm=norm)

    assert_allclose(t_X, sk_t_X)
Example 19
    def predict_new(self,
                    X,
                    only_topk=None,
                    csr_codes=None,
                    beam_size=2,
                    max_depth=None,
                    cond_prob=True,
                    normalized=False,
                    threads=-1):
        if max_depth is None:
            max_depth = self.depth
        if cond_prob is None or cond_prob == False:
            cond_prob = PostProcessor(Transform.identity, Combiner.noop)
        if cond_prob == True:
            cond_prob = PostProcessor(Transform.get_lpsvm(3), Combiner.mul)
        assert isinstance(cond_prob, PostProcessor), type(cond_prob)

        pX = PyMatrix.init_from(X, dtype=self.model_chain[0].pW.dtype)
        max_depth = min(self.depth, max_depth)
        transform = cond_prob.transform if cond_prob else Transform.identity
        pred_csr = csr_codes
        #timer = WallTimer()
        for d in range(max_depth):
            '''
            print('predict at depth {}'.format(d))
            sys.stdout.flush()
            timer.tic()
            '''
            cur_model = self.model_chain[d]
            local_only_topk = only_topk if d == (max_depth - 1) else beam_size
            pred_csr = cur_model.predict_new(pX,
                                             only_topk=local_only_topk,
                                             csr_codes=pred_csr,
                                             transform=transform,
                                             cond_prob=cond_prob,
                                             threads=threads)
            '''
            print('>>> {}ms'.format(timer.toc()))
            sys.stdout.flush()
            '''
            #if cond_prob and normalized: # perform normalization to avoid numerical issue
            #    pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm='l1')
            #print('d = {} codes:{} nnz:{}'.format(d, pred_csr.shape[1], pred_csr.nnz))
        #pred_csr.data[:] = sp.exp(pred_csr.data[:])
        if normalized:
            pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm='l1')
        return pred_csr
Example 20
    def predict(self,
                X,
                only_topk=None,
                transform=None,
                csr_codes=None,
                cond_prob=None,
                normalized=False,
                threads=-1):
        assert X.shape[1] == self.nr_features
        if csr_codes is None:
            dense = X.dot(self.W).toarray()
            if transform:
                dense = transform(dense, inplace=True)
            coo = smat_util.dense_to_coo(dense)
            pred_csr = smat_util.sorted_csr_from_coo(coo.shape,
                                                     coo.row,
                                                     coo.col,
                                                     coo.data,
                                                     only_topk=only_topk)
        else:  # csr_codes is given
            assert self.C is not None, "This model does not have C"
            assert X.shape[1] == self.nr_features
            assert csr_codes.shape[0] == X.shape[0]
            assert csr_codes.shape[1] == self.nr_codes
            csr_labels = (csr_codes.dot(self.C.T)).tocsr()
            nnz_of_insts = csr_labels.indptr[1:] - csr_labels.indptr[:-1]
            inst_idx = sp.repeat(sp.arange(X.shape[0], dtype=sp.uint32),
                                 nnz_of_insts)
            label_idx = csr_labels.indices.astype(sp.uint32)
            val = self.predict_values(X, inst_idx, label_idx, threads=threads)
            if transform:
                val = transform(val, inplace=True)
            if cond_prob:
                val[:] = cond_prob.combiner(val, csr_labels.data)

            pred_csr = smat.csr_matrix((val, label_idx, csr_labels.indptr),
                                       shape=csr_labels.shape)
            pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)
            #pred_csr = self.predict_with_coo_labels(X, coo_labels.row, coo_labels.cols, only_topk)

        if normalized:
            pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm='l1')
        return pred_csr
Example 21
    def generate_relevance_chain(self, R_dict, norm_type=None, induce=True):
        """Generate a chain of instance to cluster relevance matrix for cost sensitive learning from partial relevance chain.

        Args:
            R_dict (dict): dictionary of partial relevance chains, with keys being number of layers above leaf elements.
                R_dict[i].shape[0] == nr_inst, for all i.
                R_dict[0].shape[1] == self.chain[-1].shape[0],
                R_dict[i].shape[1] == self.chain[-i].shape[1], for i >= 1
                R_dict.keys() \\subset range(len(self.chain)+1)
            norm_type (str, optional): row-wise normalization of the resulting relevance matrices. Default None to ignore.
                Options: 'l1', 'l2', 'max', 'no-norm', None
            induce (bool, optional): whether to induce missing relevance matrix by label aggregation. Default True

        Returns:
            relevance_chain: list of csc matrices for relevance
        """

        relevance_chain = [None] * (len(self) + 1)
        # if nothing is given, return a chain of None
        if R_dict is None or all(R_dict[x] is None for x in R_dict):
            return relevance_chain

        self.matrix_chain_dimension_check(R_dict)

        # construct relevance chain from incomplete chain
        relevance_chain[0] = R_dict.get(0, None)
        for i in range(1, len(self) + 1):
            if R_dict.get(i, None) is not None:
                relevance_chain[i] = R_dict[i]
            elif relevance_chain[i - 1] is not None and induce:
                relevance_chain[i] = clib.sparse_matmul(
                    relevance_chain[i - 1], self.chain[-i])
            else:
                relevance_chain[i] = None
        relevance_chain.reverse()

        if norm_type not in [None, "no-norm"]:
            relevance_chain = [
                sk_normalize(rr.tocsr(), norm=norm_type)
                if rr is not None else None for rr in relevance_chain
            ]

        return relevance_chain[1:]
Example 22
    def load_indexed_code(code_path, label_feat):
        C = None
        mapping = {
            "none": Indexer.SKMEANS,
            "skmeans": Indexer.SKMEANS,
            "kmeans": Indexer.KMEANS,
            "kdtree": Indexer.KDTREE,
            "random": Indexer.PURE_RANDOM,
            "ordinal": Indexer.BALANCED_ORDINAL,
            "uniform": Indexer.UNIFORM,
        }
        if code_path is None:
            code_path = "none"

        if code_path.lower() in mapping:
            if label_feat is not None:
                algo = mapping[code_path.lower()]
                if algo == Indexer.SKMEANS:
                    label_feat = sk_normalize(label_feat,
                                              axis=1,
                                              norm="l2",
                                              copy=False)
                indexer = Indexer(label_feat)
                code = indexer.gen(
                    kdim=2,
                    depth=indexer.estimate_depth_with_cluster_size(100),
                    algo=algo,
                    seed=0,
                    max_iter=20,
                    threads=1,
                )
                C = code.get_csc_matrix()
        else:
            if code_path.endswith(".npz") and path.exists(code_path):
                C = smat.load_npz(code_path)
            elif path.isdir(code_path) and path.exists(
                    path.join(code_path, "code.npz")):
                C = smat.load_npz(path.join(code_path, "code.npz"))
            else:
                assert False, f"'{code_path}' does not exist. Valid ones {mapping.keys()}"
        return C
Example 23
    def predict_new(self,
                    X,
                    only_topk=None,
                    transform=None,
                    csr_codes=None,
                    cond_prob=None,
                    normalized=False,
                    threads=-1):
        assert X.shape[1] == self.nr_features
        if csr_codes is None:
            dense = X.dot(self.W).toarray()
            if transform:
                dense = transform(dense, inplace=True)
            coo = smat_util.dense_to_coo(dense)
            pred_csr = smat_util.sorted_csr_from_coo(coo.shape,
                                                     coo.row,
                                                     coo.col,
                                                     coo.data,
                                                     only_topk=only_topk)
        else:  # csr_codes is given
            assert self.C is not None, "This model does not have C"
            assert X.shape[1] == self.nr_features
            assert csr_codes.shape[0] == X.shape[0]
            assert csr_codes.shape[1] == self.nr_codes
            pZ = PyMatrix.init_from(csr_codes, self.dtype)
            csr_labels, pred_csr = clib.multilabel_predict_with_codes(
                X, self.pW, self.pC, pZ, threads=threads)
            val = pred_csr.data
            if transform:
                val = transform(val, inplace=True)
            if cond_prob:
                val[:] = cond_prob.combiner(val, csr_labels.data)

            pred_csr = smat_util.sorted_csr(pred_csr, only_topk=only_topk)

        if normalized:
            pred_csr = sk_normalize(pred_csr, axis=1, copy=False, norm='l1')
        return pred_csr
Example 24
    def pifa(Y, X, dtype=sp.float32):
        Y_avg = sk_normalize(Y, axis=1, norm="l2")
        label_embedding = smat.csr_matrix(Y_avg.T.dot(X), dtype=dtype)
        return label_embedding
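
A hedged usage sketch with tiny synthetic matrices (not from the original project; it assumes the sp/smat aliases and sk_normalize used by the definition above are in scope): Y is an instance-to-label indicator matrix and X the instance feature matrix, and pifa row-normalizes Y and multiplies its transpose with X, yielding one embedding row per label.

import numpy as np
import scipy.sparse as smat
from sklearn.preprocessing import normalize as sk_normalize

Y = smat.csr_matrix(np.array([[1, 0],
                              [1, 1],
                              [0, 1]], dtype=np.float32))      # 3 instances x 2 labels
X = smat.csr_matrix(np.random.rand(3, 5).astype(np.float32))   # 3 instances x 5 features
label_emb = pifa(Y, X)      # function defined above
print(label_emb.shape)      # (2, 5): one PIFA embedding per label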
Example 25
def main():
    # Changing to 25, which will give slightly better intervals, 20 gives very short intervals
    vad_threshold = 25  # threshold for voice activity detection

    # Data prep
    # I'm saving only 2 embeddings i.e. first and last tisv_frames for given interval in an audio. So each .npy
    # embedding file will have a shape of (2, 256)
    tf.reset_default_graph()
    batch_size = 2  # Fixing to 2 since we take 2 for each interval #utter_batch.shape[1]
    verif = tf.placeholder(
        shape=[None, batch_size, 40],
        dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([
        verif,
    ], axis=1)
    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [
            tf.contrib.rnn.LSTMCell(num_units=config.hidden,
                                    num_proj=config.proj)
            for i in range(config.num_layer)
        ]
        lstm = tf.contrib.rnn.MultiRNNCell(
            lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(
            cell=lstm, inputs=batch, dtype=tf.float32,
            time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize
    config_tensorflow = tf.ConfigProto(device_count={'GPU': 0})
    saver = tf.train.Saver(var_list=tf.global_variables())
    # Extract embeddings
    # Each embedding saved file will have (2, 256)
    with tf.Session(config=config_tensorflow) as sess:
        tf.global_variables_initializer().run()
        saver.restore(sess, config.model_path)
        logging.info("loading audio")
        audio_path = config.audio_file
        utter, sr = librosa.core.load(audio_path, sr=config.sr)  # load audio
        utter_min_len = (config.tisv_frame_min * config.hop +
                         config.window) * sr  # lower bound of utterance length
        # Get the duration
        duration = librosa.get_duration(utter, sr)
        # Duration of each window
        duration_per_frame = (duration / utter.shape[0])
        logging.info(
            f'Duration: {duration}\nDuration per frame: {duration_per_frame}s\nMin length of utterance: {utter_min_len * duration_per_frame}s'
        )
        tisv_frame_duration_s = utter_min_len * duration_per_frame
        intervals = librosa.effects.split(
            utter, top_db=vad_threshold)  # voice activity detection

        all_data = []
        logging.info('Converting intervals to embeddings')
        selected_intervals_idx = []
        for idx, current_interval in enumerate(intervals):
            if (current_interval[1] - current_interval[0]) > utter_min_len:
                # Save these selected intervals, as shorter ones are ignored
                selected_intervals_idx.append(idx)
                utterances_spec = []
                utter_part = utter[current_interval[0]:current_interval[
                    1]]  # save first and last 160 frames of spectrogram.
                S = librosa.core.stft(y=utter_part,
                                      n_fft=config.nfft,
                                      win_length=int(config.window * sr),
                                      hop_length=int(config.hop * sr))
                S = np.abs(S)**2
                mel_basis = librosa.filters.mel(sr=sr,
                                                n_fft=config.nfft,
                                                n_mels=40)
                S = np.log10(np.dot(mel_basis, S) +
                             1e-6)  # log mel spectrogram of utterances
                utterances_spec.append(S[:, :config.tisv_frame])
                utterances_spec.append(S[:, -config.tisv_frame:])
                utterances_spec = np.array(utterances_spec)
                utter_batch = np.transpose(
                    utterances_spec,
                    axes=(2, 0, 1))  # transpose [frames, batch, n_mels]

                data = sess.run(embedded, feed_dict={verif: utter_batch})
                all_data.extend(data)
    data = np.array(all_data)

    # # Spectral clustering
    # cosine similarity
    similarity = np.dot(data, data.T)
    # squared magnitude of preference vectors (number of occurrences) (diagonals are ai*ai)
    square_mag = np.diag(similarity)
    # inverse squared magnitude
    inv_square_mag = 1 / square_mag
    # if it doesn't occur, set its inverse magnitude to zero (instead of inf)
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    # inverse of the magnitude
    inv_mag = np.sqrt(inv_square_mag)
    # cosine similarity (elementwise multiply by inverse magnitudes)
    cosine = similarity * inv_mag
    A = cosine.T * inv_mag
    # Fill the diagonals with very large negative value
    np.fill_diagonal(A, -1000)
    # Fill the diagonals with the max of each row
    np.fill_diagonal(A, A.max(axis=1))
    # final step in cosine similarity
    A = (1 - A) / 2
    # Gaussian blur
    sigma = 0.5  # we will select sigma as 0.5
    A_gau = gaussian_filter(A, sigma)
    # Thresholding using multiplier = 0.01
    threshold_multiplier = 0.01
    A_thresh = A_gau * threshold_multiplier
    # Symmetrization
    A_sym = np.maximum(A_thresh, A_thresh.T)
    # Diffusion
    A_diffusion = A_sym * A_sym.T
    # Row-wise matrix Normalization
    Row_max = A_diffusion.max(axis=1).reshape(1, A_diffusion.shape[0])
    A_norm = A_diffusion / Row_max.T
    # Eigen decomposition
    eigval, eigvec = np.linalg.eig(A_norm)
    # Eigenvalues of a positive semi-definite matrix cannot be negative; numpy may return small negative values due to numerical error, so convert them to positive
    eigval = np.abs(eigval)
    # reordering eigen values
    sorted_eigval_idx = np.argsort(eigval)[::-1]
    sorted_eigval = np.sort(eigval)[::-1]
    # For division according to the equation
    eigval_shifted = np.roll(sorted_eigval, -1)
    # Threshold the eigenvalues; very small eigenvalues are due to numerical error and are not needed
    eigval_thresh = 0.1
    sorted_eigval = sorted_eigval[sorted_eigval > eigval_thresh]
    eigval_shifted = eigval_shifted[:sorted_eigval.shape[0]]
    # Don't take the first value for the calculation; if it is large, the equation below would return k=1, and we want more than one cluster
    # Get the argmax of the division; since it's 0-indexed, add 1
    k = np.argmax(sorted_eigval[1:] / eigval_shifted[1:]) + 2
    logging.debug(f'Number of Eigen vectors to pick: {k}')
    # Get the indexes of eigen vectors
    idexes = sorted_eigval_idx[:k]
    A_eigvec = eigvec[:, idexes]
    A_eigvec = A_eigvec.astype('float32')

    # # K-Means offline clustering
    A_eigvec_norm = sk_normalize(A_eigvec)  # l2 normalized
    kmeans = KMeans(n_clusters=config.number_of_speakers,
                    init='k-means++',
                    random_state=config.random_state)
    kmeans.fit(A_eigvec)
    labels = kmeans.labels_
    output_srt_json = os.path.join(config.output_dir,
                                   os.path.basename(config.srt_path) + '.json')
    output_wav_json = os.path.join(
        config.output_dir,
        os.path.basename(config.srt_path) + '.wav.json')
    OL_INDICATOR = 'OL'
    SIL_INDICATOR = -1
    json_data = []
    for idx, i in enumerate(selected_intervals_idx):
        start = str(
            datetime.timedelta(seconds=intervals[i][0] * duration_per_frame))
        end = str(
            datetime.timedelta(seconds=intervals[i][1] * duration_per_frame))
        speaker = labels[idx * 2]
        if labels[idx * 2] != labels[(idx * 2) + 1]:
            speaker = 'OL'  # possible overlap
        json_data.append({'start': start, 'end': end, 'speaker': str(speaker)})
    # Save the output to json
    with open(output_wav_json, 'w') as f:
        json.dump(json_data, f, indent=4)

    complete_json = {}
    json_data = []
    subs = pysrt.open(config.srt_path, encoding="utf-8")
    convert_to_ms = lambda st: (st.hours * 60 * 60 * 1000) + \
                                (st.minutes * 60 * 1000) +\
                                (st.seconds * 1000) +\
                                (st.milliseconds)
    for sub in subs:
        start_in_ms = convert_to_ms(sub.start)
        end_in_ms = convert_to_ms(sub.end)
        speakers = []
        for idx, i in enumerate(selected_intervals_idx):
            start = intervals[i][0] * duration_per_frame * 1000
            end = intervals[i][1] * duration_per_frame * 1000
            if start_in_ms <= start <= end_in_ms:
                speaker = int(labels[idx * 2])
                if labels[idx * 2] != labels[(idx * 2) + 1]:
                    speaker = OL_INDICATOR  # possible overlap
                speakers.append(speaker)
        json_data.append({
            "index":
            sub.index,
            "start":
            sub.start.to_time().strftime("%H:%M:%S,%f")[:-3],
            "end":
            sub.end.to_time().strftime("%H:%M:%S,%f")[:-3],
            'speakers':
            np.unique(speakers).tolist(),
            'speakers_distribution':
            speakers,
            'text':
            sub.text
        })
    metadata = {
        "overlap_indicator": OL_INDICATOR,
        "duration": duration,
        "class_names": np.unique(labels).tolist(),
        "num_of_speakers": len(set(labels)),
        "silence_indicator": SIL_INDICATOR
    }
    complete_json["metadata"] = metadata
    complete_json["srt"] = json_data
    # Save the output to json
    with open(output_srt_json, 'w') as f:
        json.dump(complete_json, f, indent=4)
Example 26
def spectral_clustering(data):
    # # Spectral clustering
    # cosine similarity
    similarity = np.dot(data, data.T)
    # squared magnitude of preference vectors (number of occurrences) (diagonals are ai*ai)
    square_mag = np.diag(similarity)
    # inverse squared magnitude
    inv_square_mag = 1 / square_mag
    # if it doesn't occur, set its inverse magnitude to zero (instead of inf)
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    # inverse of the magnitude
    inv_mag = np.sqrt(inv_square_mag)
    # cosine similarity (elementwise multiply by inverse magnitudes)
    cosine = similarity * inv_mag
    A = cosine.T * inv_mag
    # Fill the diagonals with very large negative value
    np.fill_diagonal(A, -1000)
    # Fill the diagonals with the max of each row
    np.fill_diagonal(A, A.max(axis=1))
    # final step in cosine similarity
    A = (1-A)/2
    # Gaussian blur
    sigma = 0.5  # we will select sigma as 0.5
    A_gau = gaussian_filter(A, sigma)
    # Thresholding using multiplier = 0.01
    threshold_multiplier = 0.01
    A_thresh = A_gau * threshold_multiplier
    # Symmetrization
    A_sym = np.maximum(A_thresh, A_thresh.T)
    # Diffusion
    A_diffusion = A_sym * A_sym.T
    # Row-wise matrix Normalization
    Row_max = A_diffusion.max(axis=1).reshape(1, A_diffusion.shape[0])
    A_norm = A_diffusion / Row_max.T
    # Eigen decomposition
    eigval, eigvec = np.linalg.eig(A_norm)
    # Eigenvalues of a positive semi-definite matrix cannot be negative; numpy may return small negative values due to numerical error, so convert them to positive
    eigval = np.abs(eigval)
    # reordering eigen values
    sorted_eigval_idx = np.argsort(eigval)[::-1]
    sorted_eigval = np.sort(eigval)[::-1]
    # For division according to the equation
    eigval_shifted = np.roll(sorted_eigval, -1)
    # Threshold the eigenvalues; very small eigenvalues are due to numerical error and are not needed
    eigval_thresh = 0.1
    sorted_eigval = sorted_eigval[sorted_eigval > eigval_thresh]
    eigval_shifted = eigval_shifted[:sorted_eigval.shape[0]]
    # Don't take the first value for the calculation; if it is large, the equation below would return k=1, and we want more than one cluster
    # Get the argmax of the division; since it's 0-indexed, add 1
    k = np.argmax(sorted_eigval[1:]/eigval_shifted[1:]) + 2
    print(f'Number of Eigen vectors to pick (clusters): {k}')
    # Get the indexes of eigen vectors
    idexes = sorted_eigval_idx[:k]
    A_eigvec = eigvec[:, idexes]
    A_eigvec = A_eigvec.astype('float32')

    # # K-Means offline clustering
    A_eigvec_norm = sk_normalize(A_eigvec)  # l2 normalized
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=random_state)
    kmeans.fit(A_eigvec)
    labels = kmeans.labels_

    return labels
Example 27
def evaluate_multilabel(model,
                        data,
                        alg=None,
                        classifier="lr",
                        fast=False,
                        ratio=None,
                        cv=10,
                        random_state=None,
                        normalize=False):
    X = []
    Y = []
    for pid in range(len(model.word2id)):
        X.append(model.word_embeddings[pid])

    Y = np.zeros((len(X), len(data.labels)))

    for y, key in enumerate(data.labels.keys()):
        for index, paper in enumerate(data.labels[key]):
            pid = model.word2id[paper]
            Y[pid][y] = 1
    if normalize:
        X = sk_normalize(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break

        if classifier.lower() == 'lr':
            clf = LogisticRegression()
        elif classifier.lower() == "svm":
            clf = SVC(cache_size=5000)
        elif classifier.lower() == "mlp":
            clf = MLPClassifier()
        elif classifier.lower() == "nb":
            clf = GaussianNB()

        micros = []
        macros = []
        for i in range(cv):
            micro, macro = evaluateNodeClassification(
                X, Y, 1 - r, clf=clf, random_state=random_state)
            micros.append(micro)
            macros.append(macro)
        micros = np.mean(micros)
        macros = np.mean(macros)

        df["ratio"].append(r)
        df["micro"].append(micros)
        df["macro"].append(macros)
        #df["alg"].append(alg)
        #df["data"].append(str(data))
        #df["total_samples"].append(model.total_samples)
        #df["negative"].append(model.negative)
        #df["walk_window"].append(model.walk_window)
        #df["walk_probability"].append(model.walk_probability)
        #df["L2"].append(model.l2)

        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" %
                     (r, micros, macros))

    if fast:
        return micros, macros
    else:
        return df
Example 28
def evaluate(model,
             data,
             alg=None,
             classifier="lr",
             fast=False,
             ratio=None,
             cv=10,
             normalize=False,
             random_state=None,
             return_y=False):
    X = []
    Y = []
    micros = []
    macros = []
    for y, key in enumerate(data.labels.keys()):
        for index, paper in enumerate(data.labels[key]):
            paper = paper.rstrip()
            if paper not in model.paper2id:
                print("paper not in model: ", paper)
                continue
            X.append(model.paper_embeddings[model.paper2id[paper]])
            Y.append(y)
    print("len X: ", len(X))
    print("len Y: ", len(Y))
    if normalize:
        X = sk_normalize(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    clf = LogisticRegression()
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break

        micros = []
        macros = []
        for i in range(cv):
            clf = LogisticRegression()
            if classifier.lower() == "svm":
                clf = SVC(cache_size=5000)
            elif classifier.lower() == "mlp":
                clf = MLPClassifier()
            elif classifier.lower() == "nb":
                clf = GaussianNB()

            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=1 - r, random_state=random_state)
            clf.fit(X_train, Y_train)
            prediction = clf.predict(X_test)
            #lpred = clf.predict_proba(X_test)
            #print("prediction shape: ", prediction[0])
            #print("y_test shape: ", Y_test[0])
            #print("Loss: ", log_loss(Y_test,lpred))
            micro = f1_score(Y_test, prediction, average='micro')
            macro = f1_score(Y_test, prediction, average='macro')
            micros.append(micro)
            macros.append(macro)

        #micros = np.mean(micros)
        #macros = np.mean(macros)

        df["ratio"].append(r)
        df["micro"].append(np.mean(micro))
        df["macro"].append(np.mean(macro))
        #df["alg"].append(alg)
        #df["data"].append(str(data))
        #df["total_samples"] = model.total_samples
        #df["negative"].append(model.negative)
        #df["walk_window"].append(model.walk_window)
        #df["walk_probability"].append(model.walk_probability)
        #df["L2"].append(model.l2)
        #logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r,micros,macros))

    if fast:
        if return_y:
            return micros, macros, Y_test, prediction
        return micros, macros
    else:
        return pd.DataFrame(df)
Example 29
def get_optimal_codes(Y, C, only_topk=None):
    csr_codes = smat_util.sorted_csr(Y.dot(C).tocsr(), only_topk=only_topk)
    csr_codes = sk_normalize(csr_codes, axis=1, copy=False, norm='l1')
    return csr_codes
Example 30
    # For division according to the equation
    eigval_shifted = np.roll(sorted_eigval, -1)
    # Threshold the eigenvalues; very small eigenvalues are due to numerical error and are not needed
    eigval_thresh = 0.1
    sorted_eigval = sorted_eigval[sorted_eigval > eigval_thresh]
    eigval_shifted = eigval_shifted[:sorted_eigval.shape[0]]
    # Don't take the first value for the calculation; if it is large, the equation below would return k=1, and we want more than one cluster
    # Get the argmax of the division; since it's 0-indexed, add 1
    k = np.argmax(sorted_eigval[1:] / eigval_shifted[1:]) + 2
    print(f'Number of Eigen vectors to pick: {k}')
    # Get the indexes of eigen vectors
    idexes = sorted_eigval_idx[:k]
    A_eigvec = eigvec[:, idexes]
    np.savetxt(embeddings_path, A_eigvec, delimiter='\t')  # embeddings for viz

    A_eigvec_norm = sk_normalize(A_eigvec)  # l2 normalized
    kmeans = KMeans(n_clusters=number_of_clusters,
                    init='k-means++',
                    random_state=random_state)
    kmeans.fit(A_eigvec)
    labels = kmeans.labels_

    subs = pysrt.open(srt_path, encoding="utf-8")
    convert_to_s = lambda st: (st.hours * 60 * 60) + \
                                (st.minutes * 60) +\
                                (st.seconds) #+ \
    #(st.milliseconds / 1000)
    get_start_and_end = lambda sub: (convert_to_s(sub.start),
                                     convert_to_s(sub.end))

    for sub in subs: