Beispiel #1
0
def cosine_sparse(csr1,
                  csr2,
                  topn=10,
                  min_value=0.4,
                  dense=True,
                  similarity=True,
                  expected_density=1):
    """Compute cosine similarity (or distance) between rows of two matrices.

    Each input is converted to CSR, cast to float64 and L2-normalised per
    row; the product of the first with the transpose of the second is then
    delegated to ``dot_product``.

    Args:
        csr1: Left operand; anything ``csr_matrix`` accepts.
        csr2: Right operand; its transpose is passed to ``dot_product``.
        topn: Forwarded unchanged to ``dot_product`` (presumably the number
            of results kept per row -- confirm against ``dot_product``).
        min_value: Forwarded unchanged to ``dot_product`` (presumably the
            lower cut-off for kept values -- confirm against
            ``dot_product``).
        dense: When False, the stored values of both inputs are binarised
            (bool -> int) before normalisation. When True, the values are
            used as given.
        similarity: When True, the products are returned as similarities.
            When False, every stored value ``v`` is replaced by ``1 - v``.
        expected_density: Forwarded unchanged to ``dot_product``.

    Returns:
        The object returned by ``dot_product``, with its ``.data``
        post-processed as described above.
    """

    def _unit_rows(matrix):
        # Shared preparation for both operands: CSR, optional binarisation,
        # float64, then unit-length rows.
        matrix = csr_matrix(matrix)
        if not dense:
            matrix = matrix.astype(bool, copy=False).astype(int, copy=False)
        matrix = matrix.astype('float64', copy=False)
        return normalize(matrix, norm='l2', axis=1)

    left = _unit_rows(csr1)
    right = _unit_rows(csr2)
    intrsct = dot_product(left, right.T, topn, min_value, expected_density)

    # Products of unit vectors cannot exceed 1 mathematically, but rounding
    # can push them slightly above it. The binarised path has always clamped;
    # the clamp is now also applied before converting to a distance so that
    # ``1 - v`` never yields a tiny negative distance.
    if not dense or not similarity:
        intrsct.data[intrsct.data >= 1] = 1
    if not similarity:
        intrsct.data = 1 - intrsct.data
    return intrsct
Beispiel #2
0
def pairwise_cosine_sparse_sim(csr1, csr2, topn=4000, min_value=0.4, word_embedings=True):
    """Compute cosine similarity between the rows of `csr1` and `csr2`.

    Both inputs are converted to CSR, cast to float64 and L2-normalised per
    row before ``dot_product`` is applied to the first and the transpose of
    the second.

    Args:
        csr1: Left operand; anything ``csr_matrix`` accepts.
        csr2: Right operand; its transpose is passed to ``dot_product``.
        topn: Forwarded unchanged to ``dot_product``.
        min_value: Forwarded unchanged to ``dot_product``.
        word_embedings: When True, the stored values are used as given.
            When False, they are binarised (bool -> int) first and the
            resulting similarities are clamped to at most 1.

    Returns:
        The object returned by ``dot_product``.
    """
    binarise = not word_embedings

    unit_rows = []
    for operand in (csr1, csr2):
        operand = csr_matrix(operand)
        if binarise:
            operand = operand.astype(bool, copy=False).astype(int, copy=False)
        operand = operand.astype('float64', copy=False)
        unit_rows.append(normalize(operand, norm='l2', axis=1))
    left, right = unit_rows

    sims = dot_product(left, right.T, topn, min_value)
    if binarise:
        # Only the binarised path clamps values that reach or exceed 1.
        too_large = sims.data >= 1
        sims.data[too_large] = 1
    return sims
Beispiel #3
0
def pairwise_cosine_sparse(csr, topn=2000, min_value=0.1):
    """Compute pairwise cosine distances between the binarised rows of `csr`.

    The input is converted to CSR, binarised (bool -> int), cast to float64
    and L2-normalised per row. ``dot_product`` is then applied to the matrix
    and its own transpose, and every stored value ``v`` of the result is
    turned into ``1 - min(v, 1)``.

    Args:
        csr: Input matrix; anything ``csr_matrix`` accepts.
        topn: Forwarded unchanged to ``dot_product``.
        min_value: Forwarded unchanged to ``dot_product``. It is applied to
            the similarity values, before the conversion to distance.

    Returns:
        The object returned by ``dot_product``, holding distances in
        ``.data``.
    """
    binary = csr_matrix(csr).astype(bool, copy=False).astype(int, copy=False)
    as_float = binary.astype('float64', copy=False)
    unit_rows = normalize(as_float, norm='l2', axis=1)

    result = dot_product(unit_rows, unit_rows.T, topn, min_value)

    # Clamp to 1 first so the subtraction below cannot go negative.
    at_or_above_one = result.data >= 1
    result.data[at_or_above_one] = 1
    result.data = 1 - result.data
    return result
def _l2_normalize_rows(matrix):
    """Scale every row of `matrix` to unit L2 norm.

    Defined at module level on purpose: inside
    ``pairwise_cosine_sparse_sim`` the name ``normalize`` is a boolean
    parameter, so the module-level ``normalize`` function cannot be called
    from there directly.
    """
    return normalize(matrix, norm='l2', axis=1)


def pairwise_cosine_sparse_sim(csr, topn=4000, min_value=0.4, expected_density=1, sparse=True, normalize=True):
    """Compute pairwise similarities between the rows of `csr`.

    Args:
        csr: Input matrix. With ``sparse=True`` it may be anything
            ``csr_matrix`` accepts. With ``sparse=False`` it is used as
            given and must itself provide ``astype`` and ``.T``.
        topn: Forwarded unchanged to ``dot_product``.
        min_value: Forwarded unchanged to ``dot_product``.
        expected_density: Forwarded unchanged to ``dot_product``.
        sparse: When True, the input is converted to CSR and its stored
            values are binarised (bool -> int). When False, the values are
            used as given and non-positive results are set to 0.
        normalize: When True, rows are L2-normalised before the product, so
            the results are cosine similarities. When False, the raw
            products are returned.

    Returns:
        The object returned by ``dot_product`` applied to the prepared
        matrix and its transpose.
    """
    if sparse:
        csr = csr_matrix(csr).astype(bool, copy=False).astype(int, copy=False)
    csr = csr.astype('float64', copy=False)

    if normalize:
        # ``normalize`` is the boolean flag here; calling it, as the previous
        # code did, raised TypeError. The helper reaches the real function.
        csr = _l2_normalize_rows(csr)

    intrsct = dot_product(csr, csr.T, topn, min_value, expected_density)

    if not sparse:
        # Non-binarised input can give negative products; floor them at 0.
        intrsct.data[intrsct.data <= 0] = 0
    return intrsct
Beispiel #5
0
def pairwise_jaccard_sparse(csr, topn=2000, min_value=2):
    """Compute pairwise Jaccard distances between the binarised rows of `csr`.

    Each row is treated as a set of its non-zero column indices. The
    intersection sizes come from ``dot_product`` applied to the 0/1 matrix
    and its transpose. For every stored pair the distance is
    ``1 - |A & B| / |A | B|`` with ``|A | B| = |A| + |B| - |A & B|``.

    Args:
        csr: Input matrix; anything ``csr_matrix`` accepts.
        topn: Forwarded unchanged to ``dot_product``.
        min_value: Intersection-size cut-off. ``min_value - 0.1`` is what is
            actually passed to ``dot_product`` (presumably so that pairs
            sharing exactly ``min_value`` columns survive a strict
            comparison -- confirm against ``dot_product``).

    Returns:
        The object returned by ``dot_product``, with ``.data`` replaced by
        Jaccard distances. The code relies on it exposing CSR-style
        ``indices``, ``data`` and ``getnnz(axis=1)``.
    """
    csr = csr_matrix(csr).astype(bool, copy=False).astype(int, copy=False)
    csr = csr.astype('float64', copy=False)

    # Set size of each row. The 0/1 values are summed instead of calling
    # getnnz(): getnnz() counts stored entries, so an explicitly stored zero
    # in the input would be counted as a set member and inflate the union.
    csr_rownnz = np.asarray(csr.sum(axis=1)).ravel()

    intrsct = dot_product(csr, csr.T, topn, min_value - 0.1)

    # Expand the per-row sizes so they line up one-to-one with intrsct.data:
    # nnz_i is the size of the row each entry belongs to, and
    # csr_rownnz[intrsct.indices] is the size of the row it is compared with.
    nnz_i = np.repeat(csr_rownnz, intrsct.getnnz(axis=1))
    unions = nnz_i + csr_rownnz[intrsct.indices] - intrsct.data
    intrsct.data = 1.0 - intrsct.data / unions

    return intrsct