def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Compute the sparse product ``A * B`` keeping at most ``ntop`` entries per row.

    Both operands are converted to CSR and their raw index/data arrays are
    handed to ``ct.sparse_dot_topn`` together with pre-allocated output
    buffers sized for ``ntop`` entries per row of ``A``.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Operands of the product; ``A.shape[1]`` must equal ``B.shape[0]``.
    ntop : int
        Maximum number of entries kept in each result row.
    lower_bound : number, optional
        Threshold forwarded unchanged to ``ct.sparse_dot_topn``.

    Returns
    -------
    csr_matrix
        Result of shape ``(A.shape[0], B.shape[1])``.

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` do not match.
    """
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, K1 = A.shape
    K2, N = B.shape

    # Only M and N are passed to the kernel below, so an inner-dimension
    # mismatch has to be caught here.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # An operand without stored entries makes the whole product zero; return
    # it directly instead of handing empty buffers to the native routine.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # Upper bound on the number of stored results: ntop per row of A.
    nnz_max = M*ntop

    # Output buffers, filled in place by the kernel.
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply two sparse matrices, retaining at most ``ntop`` values per row.

    The heavy lifting is delegated to ``ct.sparse_dot_topn``, which writes
    into CSR buffers allocated here.

    Args:
        A: left operand (any scipy sparse format; converted to CSR).
        B: right operand (any scipy sparse format; converted to CSR).
        ntop: maximum number of entries kept per row of the result.
        lower_bound: threshold passed through to ``ct.sparse_dot_topn``.

    Returns:
        A ``csr_matrix`` of shape ``(A.shape[0], B.shape[1])``.

    Raises:
        ValueError: when ``A.shape[1] != B.shape[0]``.
    """
    A = A.tocsr()
    B = B.tocsr()
    M, K1 = A.shape
    K2, N = B.shape

    # The kernel only receives M and N; validate the shared dimension here.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # With no stored entries on either side the product is all zeros, so the
    # native call (and its empty input buffers) can be skipped entirely.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # Room for ntop results in every row of A.
    nnz_max = M*ntop

    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    # indptr/indices/data are output arguments written by the kernel.
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))
Beispiel #3
0
def awesome_cossim_topn(A,
                        B,
                        ntop,
                        lower_bound=0,
                        use_threads=False,
                        n_jobs=1):
    """
    Return a matrix C in CSR format, where
    C = [sorted top n results and results > lower_bound for each row of A * B]

    Input:
        A and B: two sparse matrices
        ntop: n top results
        lower_bound: a threshold that the elements of A*B must be greater than
        use_threads: use the multi-threaded kernel or not
        n_jobs: number of threads, must be >= 1 when use_threads is set

    Output:
        C: result matrix of shape (A.shape[0], B.shape[1])

    Raises:
        ValueError: if use_threads is set and n_jobs < 1, or if
            A.shape[1] != B.shape[0]

    N.B. if A and B are not CSR format, they will be converted to CSR
    """
    # Validate the threading request first so a bad n_jobs is reported even
    # when the product turns out to be trivially empty.
    if use_threads and n_jobs < 1:
        err_str = 'You select the multi-thread mode and n_job must be a value greater equal than 1!'
        raise ValueError(err_str)

    if not isspmatrix_csr(A):
        A = A.tocsr()

    if not isspmatrix_csr(B):
        B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # Only M and N reach the kernels below; check the inner dimension here.
    if K1 != K2:
        err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!'
        raise ValueError(err_str)

    # No stored entries on either side means an all-zero product; return it
    # without handing empty buffers to the native code.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # At most ntop results are stored for each of the M rows.
    nnz_max = M * ntop

    # Output buffers written in place by the kernel.
    indptr = np.empty(M + 1, dtype=idx_dtype)
    indices = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    if not use_threads:

        ct.sparse_dot_topn(M, N, np.asarray(A.indptr, dtype=idx_dtype),
                           np.asarray(A.indices, dtype=idx_dtype), A.data,
                           np.asarray(B.indptr, dtype=idx_dtype),
                           np.asarray(B.indices, dtype=idx_dtype), B.data,
                           ntop, lower_bound, indptr, indices, data)

    else:
        ct_thread.sparse_dot_topn_threaded(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype), A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype), B.data, ntop, lower_bound,
            indptr, indices, data, n_jobs)

    return csr_matrix((data, indices, indptr), shape=(M, N))
Beispiel #4
0
def cossim_top(A, B, ntop, lower_bound=0):
    """Compute ``A * B`` keeping at most ``ntop`` entries per row.

    The product is evaluated by ``sparse_dot_topn``, which is imported lazily
    so that argument errors and trivially empty products are handled even when
    the library is not installed.

    Args:
        A: left operand (scipy sparse matrix; converted to CSR).
        B: right operand (scipy sparse matrix; converted to CSR).
        ntop: maximum number of entries kept per result row.
        lower_bound: threshold forwarded to ``sparse_dot_topn``.

    Returns:
        A ``csr_matrix`` of shape ``(A.shape[0], B.shape[1])``.

    Raises:
        ValueError: when ``A.shape[1] != B.shape[0]``.
        SystemExit: when the ``sparse_dot_topn`` library cannot be imported.
    """
    # Both operands must be CSR: their indptr/indices arrays are passed to the
    # kernel as row-compressed data.
    A = A.tocsr()
    B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # Only M and N are handed to the kernel; check the inner dimension here.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # An operand without stored entries yields an all-zero product.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    try:
        import sparse_dot_topn.sparse_dot_topn as ct
    except ModuleNotFoundError:
        # Adjacent literals keep the message on one line without the stray
        # indentation that backslash continuations embed in the string.
        print("This module requires the sparse_dot_topn library "
              "accelerated sparse matrix multiplication,which can be found "
              "at https://github.com/ing-bank/sparse_dot_topn.")
        import sys
        sys.exit(1)

    idx_dtype = np.int32

    # Capacity for ntop results in each of the M rows.
    nnz_max = M * ntop

    # Output buffers written in place by the kernel.
    indptr = np.empty(M + 1, dtype=idx_dtype)
    indices = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(M, N, np.asarray(A.indptr, dtype=idx_dtype),
                       np.asarray(A.indices, dtype=idx_dtype), A.data,
                       np.asarray(B.indptr, dtype=idx_dtype),
                       np.asarray(B.indices, dtype=idx_dtype), B.data, ntop,
                       lower_bound, indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))
    def _awesome_cossim_top(self, ntop, lower_bound):
        ''' https://gist.github.com/ymwdalex/5c363ddc1af447a9ff0b58ba14828fd6#file-awesome_sparse_dot_top-py '''
        # To CSR Matrix, if needed
        A = self.tfidf_vect.fit_transform(self.source_names).tocsr()
        B = self.tfidf_vect.fit_transform(self.target_names).transpose().tocsr()
        M, _ = A.shape
        _, N = B.shape

        idx_dtype = np.int32

        nnz_max = M * ntop

        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)

        self.sprse_mtx = csr_matrix((data,indices,indptr), shape=(M,N))
def awesome_cossim_top(A, B, ntop, pFromDir, pToDir, lower_bound=0):
    """Compute ``A * B`` keeping at most ``ntop`` entries per row, best-effort.

    Any exception raised while preparing or running the kernel is reported on
    stdout, ``utils.movefile(pFromDir, pToDir)`` is called, and ``-1`` is
    returned instead of a matrix.

    Args:
        A: left operand (scipy sparse matrix; converted to CSR).
        B: right operand (scipy sparse matrix; converted to CSR).
        ntop: maximum number of entries kept per result row.
        pFromDir: first argument handed to ``utils.movefile`` on failure.
        pToDir: second argument handed to ``utils.movefile`` on failure.
        lower_bound: threshold forwarded to ``ct.sparse_dot_topn``.

    Returns:
        A ``csr_matrix`` of shape ``(A.shape[0], B.shape[1])`` on success,
        or ``-1`` when an error occurred (including mismatched shapes).
    """
    try:
        # force A and B as a CSR matrix.
        # If they have already been CSR, there is no overhead
        A = A.tocsr()
        B = B.tocsr()
        M, K1 = A.shape
        K2, N = B.shape

        # Only M and N reach the kernel. Raising here routes a shape mismatch
        # through the same reporting path as every other failure.
        if K1 != K2:
            raise ValueError(
                'A matrix multiplication will be operated. '
                'A.shape[1] must be equal to B.shape[0]!')

        # An operand without stored entries gives an all-zero product; skip
        # the native call and its empty input buffers.
        if A.nnz == 0 or B.nnz == 0:
            return csr_matrix((M, N), dtype=A.dtype)

        idx_dtype = np.int32

        # Capacity for ntop results in each of the M rows.
        nnz_max = M*ntop

        # Output buffers written in place by the kernel.
        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)
    except Exception as e:
        # Deliberate best-effort: report, move the offending file aside and
        # signal failure with a sentinel rather than propagating.
        print('*** ERROR[001]: Error in similarity calculating matrix: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return(-1)

    return csr_matrix((data,indices,indptr),shape=(M,N))
Beispiel #7
0
    def _awesome_cossim_top(self, A, B, ntop):
        """
        True magic, an improvment on scikit-learn's cosine_similarity function.
        Thanks to ING BANK.

        ING definition:
            This function will return a matrix C in CSR format, where
            C = [sorted top n results and results > lower_bound for each row of A * B]
                Input:
                    A and B: two CSR matrix
                    ntop: n top results
                    self.lowest_similarity: a threshold that the element of A*B must greater than
                Output:
                    C: result matrix
            N.B. if A and B are not CSR format, they will be converted to CSR

        """
        if not isspmatrix_csr(A):
            A = A.tocsr()

        if not isspmatrix_csr(B):
            B = B.tocsr()

        M, K1 = A.shape
        K2, N = B.shape

        if K1 != K2:
            err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!'
            raise ValueError(err_str)

        idx_dtype = np.int32

        nnz_max = M * ntop

        # basic check. if A or B are all zeros matrix, return all zero matrix directly
        if len(A.indices) == 0 or len(B.indices) == 0:
            indptr = np.zeros(M + 1, dtype=idx_dtype)
            indices = np.zeros(nnz_max, dtype=idx_dtype)
            data = np.zeros(nnz_max, dtype=A.dtype)
            return csr_matrix((data, indices, indptr), shape=(M, N))

        # filled matrices from here on
        indptr = np.zeros(M + 1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(M, N, np.asarray(A.indptr, dtype=idx_dtype),
                           np.asarray(A.indices, dtype=idx_dtype), A.data,
                           np.asarray(B.indptr, dtype=idx_dtype),
                           np.asarray(B.indices, dtype=idx_dtype), B.data,
                           ntop, self.lowest_similarity, indptr, indices, data)

        return csr_matrix((data, indices, indptr), shape=(M, N))
def awesome_cossim_topn(A, B, ntop, lower_bound=0):
    """
    Return a matrix C in CSR format, where
    C = [sorted top n results and results > lower_bound for each row of A * B]

    Input:
        A and B: two sparse matrices
        ntop: n top results
        lower_bound: a threshold that the elements of A*B must be greater than

    Output:
        C: result matrix of shape (A.shape[0], B.shape[1])

    Raises:
        ValueError: if A.shape[1] != B.shape[0]

    N.B. if A and B are not CSR format, they will be converted to CSR
    """
    if not isspmatrix_csr(A):
        A = A.tocsr()

    if not isspmatrix_csr(B):
        B = B.tocsr()

    M, K1 = A.shape
    K2, N = B.shape

    # The kernel is only told M and N, so the shared dimension is checked here.
    if K1 != K2:
        err_str = 'A matrix multiplication will be operated. A.shape[1] must be equal to B.shape[0]!'
        raise ValueError(err_str)

    # If either operand stores nothing the product is all zeros; return it
    # without passing empty buffers to the native routine.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # At most ntop results are stored for each of the M rows.
    nnz_max = M*ntop

    # Output buffers written in place by the kernel.
    indptr = np.empty(M+1, dtype=idx_dtype)
    indices = np.empty(nnz_max, dtype=idx_dtype)
    data = np.empty(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """
    Evaluates the similarity between two groups of strings (given as sparse
    matrices) through the sparse product ``A * B``, keeping at most ``ntop``
    values per row.

    Parameters
    ----------
    A,B : matrix
        sparse matrix representation of the strings to compare;
        ``A.shape[1]`` must equal ``B.shape[0]``
    ntop : int
        Maximum number of matches kept per row of ``A``
    lower_bound : number, optional
        Threshold forwarded to ``ct.sparse_dot_topn``

    Returns
    -------
    csr matrix
        a sparse matrix of shape ``(A.shape[0], B.shape[1])`` with at most
        ``ntop`` stored values per row

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` differ
    """

    # force A and B as a CSR matrix.
    A = A.tocsr()
    B = B.tocsr()
    M, K1 = A.shape
    K2, N = B.shape

    # The kernel receives only M and N; verify the shared dimension here.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!')

    # No stored entries on one side means every product entry is zero.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype=A.dtype)

    idx_dtype = np.int32

    # Capacity for ntop results in each of the M rows.
    nnz_max = M * ntop

    # Output buffers written in place by the kernel.
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(M, N, np.asarray(A.indptr, dtype=idx_dtype),
                       np.asarray(A.indices, dtype=idx_dtype), A.data,
                       np.asarray(B.indptr, dtype=idx_dtype),
                       np.asarray(B.indices, dtype=idx_dtype), B.data, ntop,
                       lower_bound, indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))
Beispiel #10
0
def cosimtop(A, B, ntop, lower_bound=0):

    '''
    Optimized cosine similarity computation via a sparse top-n product.
    :param A: First matrix (scipy sparse; converted to CSR).
    :param B: Second matrix (scipy sparse; converted to CSR).
    :param ntop: Maximum number of entries kept for each row.
    :param lower_bound: Threshold forwarded to ``ct.sparse_dot_topn``.
    :return: CSR matrix of shape ``(A.shape[0], B.shape[1])``.
    :raises ValueError: If ``A.shape[1] != B.shape[0]``.
    '''

    A = A.tocsr()
    B = B.tocsr()
    M, K1 = A.shape
    K2, N = B.shape

    # Only M and N are passed to the kernel; check the inner dimension here.
    if K1 != K2:
        raise ValueError(
            'A matrix multiplication will be operated. '
            'A.shape[1] must be equal to B.shape[0]!'
        )

    # An operand with no stored entries makes the product all zeros; skip the
    # native call and its empty input buffers.
    if A.nnz == 0 or B.nnz == 0:
        return csr_matrix((M, N), dtype = A.dtype)

    idx_dtype = np.int32
    # Capacity for ntop results in each of the M rows.
    nnz_max = M * ntop
    # Output buffers written in place by the kernel.
    indptr = np.zeros(M + 1, dtype = idx_dtype)
    indices = np.zeros(nnz_max, dtype = idx_dtype)
    data = np.zeros(nnz_max, dtype = A.dtype)
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype = idx_dtype),
        np.asarray(A.indices, dtype = idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype = idx_dtype),
        np.asarray(B.indices, dtype = idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, 
        indices, 
        data
    )
    return csr_matrix(
        (data, indices, indptr), 
        shape=(M, N)
    )