def cosine_sparse(csr1, csr2, topn=10, min_value=0.4, dense=True,
                  similarity=True, expected_density=1):
    """Cosine similarity (or distance) between the rows of two matrices.

    Each input is converted to a float64 CSR matrix whose rows are scaled
    to unit L2 norm; the scores then come from
    ``dot_product(csr1, csr2.T, topn, min_value, expected_density)``.

    Parameters
    ----------
    csr1, csr2
        Anything ``csr_matrix`` accepts; each row is one vector.
    topn, min_value, expected_density
        Passed through unchanged to ``dot_product``.
        NOTE(review): presumably the per-row result limit, the score
        cut-off and an allocation hint -- confirm against ``dot_product``.
    dense : bool
        If true, the stored values are used as they are.  If false, every
        stored value is first reduced to 0 or 1, and resulting scores of
        1 or more are capped at exactly 1.
    similarity : bool
        If false, every stored score ``s`` is replaced by ``1 - s``.

    Returns
    -------
    The object returned by ``dot_product``, with its ``data`` adjusted as
    described above.
    """
    binary = not dense

    def _unit_rows(matrix):
        # Prepare one operand: CSR, optionally binarised, float64, unit rows.
        prepared = csr_matrix(matrix)
        if binary:
            # bool -> int collapses every stored value to 0 or 1.
            prepared = prepared.astype(bool, copy=False).astype(int, copy=False)
        prepared = prepared.astype('float64', copy=False)
        return normalize(prepared, norm='l2', axis=1)

    left = _unit_rows(csr1)
    right = _unit_rows(csr2)
    scores = dot_product(left, right.T, topn, min_value, expected_density)

    if binary:
        # Only the binarised path caps scores at 1.
        scores.data[scores.data >= 1] = 1
    if not similarity:
        scores.data = 1 - scores.data
    return scores
def pairwise_cosine_sparse_sim(csr1, csr2, topn=4000, min_value=0.4, word_embedings=True):
    """Cosine similarity between the rows of `csr1` and the rows of `csr2`.

    Each input is converted to a float64 CSR matrix with unit-L2-norm
    rows, and the scores come from
    ``dot_product(csr1, csr2.T, topn, min_value)``.

    NOTE(review): this file contains a second function with this exact
    name further down.  Python keeps only the later definition, so this
    one is not reachable by name unless one of the two is renamed.

    Parameters
    ----------
    csr1, csr2
        Anything ``csr_matrix`` accepts; each row is one vector.
    topn, min_value
        Passed through unchanged to ``dot_product``.
        NOTE(review): presumably the per-row result limit and the score
        cut-off -- confirm against ``dot_product``.
    word_embedings : bool
        If true, the stored values are used as they are.  If false, every
        stored value is first reduced to 0 or 1, and resulting scores of
        1 or more are capped at exactly 1.

    Returns
    -------
    The object returned by ``dot_product``.
    """
    binarise = not word_embedings

    unit_operands = []
    for raw in (csr1, csr2):
        matrix = csr_matrix(raw)
        if binarise:
            # bool -> int collapses every stored value to 0 or 1.
            matrix = matrix.astype(bool, copy=False).astype(int, copy=False)
        matrix = matrix.astype('float64', copy=False)
        unit_operands.append(normalize(matrix, norm='l2', axis=1))

    first, second = unit_operands
    sims = dot_product(first, second.T, topn, min_value)
    if binarise:
        sims.data[sims.data >= 1] = 1
    return sims
def pairwise_cosine_sparse(csr, topn=2000, min_value=0.1):
    """Cosine distances between the rows of `csr`, viewed as 0/1 vectors.

    Every stored value of `csr` is reduced to 0 or 1, the rows are scaled
    to unit L2 norm, and similarity scores are taken from
    ``dot_product(csr, csr.T, topn, min_value)``.  Scores of 1 or more
    are capped at 1 and each score ``s`` is then stored as ``1 - s``, so
    no stored distance is negative.

    Parameters
    ----------
    csr
        Anything ``csr_matrix`` accepts; each row is one vector.
    topn, min_value
        Passed through unchanged to ``dot_product``; they act on the
        similarity scores, before the conversion to distances.
        NOTE(review): presumably the per-row result limit and the score
        cut-off -- confirm against ``dot_product``.

    Returns
    -------
    The object returned by ``dot_product``, with ``data`` holding
    distances.
    """
    # bool -> int collapses every stored value to 0 or 1.
    membership = csr_matrix(csr).astype(bool, copy=False).astype(int, copy=False)
    unit_rows = normalize(membership.astype('float64', copy=False), norm='l2', axis=1)

    distances = dot_product(unit_rows, unit_rows.T, topn, min_value)
    # Cap before subtracting so the distance cannot drop below zero.
    distances.data[distances.data >= 1] = 1
    distances.data = 1 - distances.data
    return distances
def _l2_normalize_rows(matrix):
    """Scale each row of `matrix` to unit L2 norm with the module-level ``normalize``.

    Kept outside ``pairwise_cosine_sparse_sim`` because that function has
    a boolean parameter that is also named ``normalize``; inside its body
    the name refers to the flag, not to the function.
    """
    return normalize(matrix, norm='l2', axis=1)


def pairwise_cosine_sparse_sim(csr, topn=4000, min_value=0.4, expected_density=1, sparse=True, normalize=True):
    """Similarity scores between all pairs of rows of `csr`.

    Scores come from
    ``dot_product(csr, csr.T, topn, min_value, expected_density)`` after
    the optional preparation steps selected by `sparse` and `normalize`.

    Parameters
    ----------
    csr
        With ``sparse=True``: anything ``csr_matrix`` accepts.  With
        ``sparse=False``: an object that already provides ``astype`` and
        ``.T`` (no conversion is applied).
    topn, min_value, expected_density
        Passed through unchanged to ``dot_product``.
        NOTE(review): presumably the per-row result limit, the score
        cut-off and an allocation hint -- confirm against ``dot_product``.
    sparse : bool
        If true, every stored value is reduced to 0 or 1 before scoring.
        If false, values are used as given and non-positive scores in
        the result are set to 0.
    normalize : bool
        If true, rows are scaled to unit L2 norm so the scores are cosine
        similarities.  If false, the scores are raw dot products.

    Returns
    -------
    The object returned by ``dot_product``.
    """
    if sparse:
        # bool -> int collapses every stored value to 0 or 1.
        csr = csr_matrix(csr).astype(bool, copy=False).astype(int, copy=False)
    csr = csr.astype('float64', copy=False)

    if normalize:
        # Must go through the helper: here `normalize` is the boolean
        # flag, and calling it directly raises TypeError.
        csr = _l2_normalize_rows(csr)

    intrsct = dot_product(csr, csr.T, topn, min_value, expected_density)

    # Scores are deliberately not capped at 1: with normalize=False they
    # are raw dot products and may legitimately exceed 1.
    if not sparse:
        # Only needed for signed input; 0/1 input cannot produce a
        # negative score, so this is skipped on the binarised path.
        intrsct.data[intrsct.data <= 0] = 0
    return intrsct
def pairwise_jaccard_sparse(csr, topn=2000, min_value=2):
    """Jaccard distances between the rows of `csr`, viewed as sets.

    Every stored value of `csr` is reduced to 0 or 1, so each row is the
    set of columns holding a non-zero value.  Intersection sizes come
    from ``dot_product(csr, csr.T, topn, min_value - 0.1)``; for every
    pair it returns, the stored value becomes
    ``1 - |A & B| / |A | B|``.

    Parameters
    ----------
    csr
        Anything ``csr_matrix`` accepts; each row is one set.
    topn
        Passed through unchanged to ``dot_product``.
        NOTE(review): presumably the per-row result limit -- confirm.
    min_value
        Lowered by 0.1 and passed to ``dot_product``.
        NOTE(review): presumably the minimum intersection size, with the
        offset letting an overlap of exactly `min_value` pass a strict
        comparison -- confirm against ``dot_product``.

    Returns
    -------
    The object returned by ``dot_product``, with ``data`` holding
    distances.  The code relies on it exposing CSR-style ``indices``,
    ``data`` and ``getnnz(axis=1)``.
    """
    # bool -> int collapses every stored value to 0 or 1.
    csr = csr_matrix(csr).astype(bool, copy=False).astype(int, copy=False)
    csr = csr.astype('float64', copy=False)

    # Every stored value is now 0.0 or 1.0, so a row sum is that row's set
    # size.  Counting stored entries with getnnz() would also count
    # explicitly stored zeros and overstate the unions.
    set_sizes = np.asarray(csr.sum(axis=1)).ravel()

    intrsct = dot_product(csr, csr.T, topn, min_value - 0.1)

    # Line up |A| (size of the row owning each stored entry) and |B|
    # (size of the row named by the entry's column index) with data.
    size_a = np.repeat(set_sizes, intrsct.getnnz(axis=1))
    size_b = set_sizes[intrsct.indices]
    unions = size_a + size_b - intrsct.data

    intrsct.data = 1.0 - intrsct.data / unions
    return intrsct