def fit(self, topK=50, shrink=100, similarity='cosine', feature_weighting="none", **similarity_args):
    """Build ``self.W_sparse`` from ``self.URM_train`` with a similaripy metric.

    Parameters
    ----------
    topK : int
        Number of neighbours requested. It is incremented by one before
        being stored in ``self.topK`` and passed to the similarity routine,
        because the routine also returns the self-similarity, whose
        diagonal is zeroed afterwards.
    shrink : int or float
        Shrink term forwarded to the similarity routine.
    similarity : str
        One of ``"cosine"``, ``"jaccard"``, ``"dice"``, ``"tversky"`` or
        ``"splus"``.
    feature_weighting : str
        Must be one of ``self.FEATURE_WEIGHTING_VALUES``. ``"BM25"`` and
        ``"TF-IDF"`` overwrite ``self.URM_train`` with a re-weighted
        float32 CSR matrix; any other accepted value leaves it untouched.
    **similarity_args
        Extra keyword arguments forwarded unchanged to the similarity
        routine.

    Raises
    ------
    ValueError
        If ``feature_weighting`` or ``similarity`` is not recognised.
    """
    # Similaripy returns also self similarity, which will be set to 0 afterwards
    topK += 1
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    # NOTE(review): the weighting overwrites self.URM_train, so calling fit()
    # twice with a weighting applies it twice -- confirm this is intended.
    if feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')
    elif feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    if similarity == "cosine":
        self.W_sparse = sim.cosine(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "jaccard":
        self.W_sparse = sim.jaccard(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "dice":
        self.W_sparse = sim.dice(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "tversky":
        # This branch used to be guarded by a duplicated "jaccard" test and
        # was therefore unreachable.
        self.W_sparse = sim.tversky(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "splus":
        self.W_sparse = sim.s_plus(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    else:
        raise ValueError(
            "Unknown value '{}' for similarity".format(similarity))

    # Drop the self-similarity, then store the transposed matrix as CSR.
    self.W_sparse.setdiag(0)
    self.W_sparse = self.W_sparse.transpose().tocsr()
def check_similarity(m, k, rtol=0.0001, full=False):
    """Cross-check the ``sim.*`` similarity routines against the ``py_*`` ones.

    Every metric is computed twice on ``m`` with ``k`` neighbours: once with
    the ``sim`` module (labelled "cython" in the original code) and once with
    the ``py_*`` helpers (labelled "python"). The ``check_sum`` values of
    each pair must agree within ``rtol``. When ``full`` is true,
    ``check_full`` must additionally return 0 for each pair.

    Parameters
    ----------
    m : matrix passed unchanged to every similarity routine.
    k : int, forwarded as the ``k`` argument of every routine.
    rtol : float, relative tolerance for the comparisons.
    full : bool, whether to run the additional ``check_full`` comparison.
    """
    # cython
    fast_dot = sim.dot_product(m, k=k)
    fast_cosine = sim.cosine(m, k=k)
    fast_asy_cosine = sim.asymmetric_cosine(m, alpha=0.2, k=k)
    fast_jaccard = sim.jaccard(m, k=k)
    fast_dice = sim.dice(m, k=k)
    fast_tversky = sim.tversky(m, alpha=0.8, beta=0.4, k=k)
    fast_p3alpha = sim.p3alpha(m, alpha=0.8, k=k)
    fast_rp3beta = sim.rp3beta(m, alpha=0.8, beta=0.4, k=k)

    # python
    ref_dot = py_dot(m, k)
    ref_cosine = py_cosine(m, k).tocsr()
    ref_asy_cosine = py_asy_cosine(m, 0.2, k=k)
    ref_jaccard = py_jaccard(m, k)
    ref_dice = py_dice(m, k)
    ref_tversky = py_tversky(m, alpha=0.8, beta=0.4, k=k)
    ref_p3alpha = py_p3alpha(m, alpha=0.8, k=k)
    ref_rp3beta = py_rp3beta(m, alpha=0.8, beta=0.4, k=k)

    # (cython result, python result, failure message), in assertion order.
    comparisons = (
        (fast_dot, ref_dot, 'dot error'),
        (fast_cosine, ref_cosine, 'cosine error'),
        (fast_asy_cosine, ref_asy_cosine, 'asy_cosine error'),
        (fast_jaccard, ref_jaccard, 'jaccard error'),
        (fast_dice, ref_dice, 'dice error'),
        (fast_tversky, ref_tversky, 'tversky error'),
        (fast_p3alpha, ref_p3alpha, 'p3alpha error'),
        (fast_rp3beta, ref_rp3beta, 'rp3beta error'),
    )

    # test
    for fast, reference, message in comparisons:
        np.testing.assert_allclose(
            check_sum(fast), check_sum(reference), rtol=rtol, err_msg=message)

    # test full rows
    if full:
        for fast, reference, message in comparisons:
            np.testing.assert_(check_full(fast, reference, rtol) == 0, msg=message)

    return
def similarity(matrix, k=100, sim_type='cosine', shrink=0, threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None):
    """Compute a similarity on the transpose of ``matrix`` using the ``sim`` module.

    Parameters
    ----------
    matrix : object exposing ``.T``; the transpose is what gets passed on.
    k, shrink, threshold : forwarded unchanged to the selected routine.
    sim_type : str
        One of 'cosine', 'asymcosine', 'jaccard', 'tversky', 'p3alpha',
        'rp3beta', 'splus', 'dice'.
    implicit : forwarded as the ``binary`` argument.
    alpha : forwarded for 'asymcosine', 'tversky', 'p3alpha' and 'rp3beta'.
    beta : forwarded for 'tversky' and 'rp3beta'.
    l, c : accepted but not forwarded to any routine.

    Returns
    -------
    Whatever the selected ``sim`` routine returns. For an unrecognised
    ``sim_type`` a message is printed and ``None`` is returned.
    """
    transposed = matrix.T
    shared = dict(k=k, shrink=shrink, threshold=threshold, binary=implicit)

    # (metric name, attribute of `sim` to call, metric-specific keyword
    # arguments), tested in this order. The attribute is looked up only for
    # the metric that matches.
    routes = (
        ('cosine', 'cosine', {}),
        ('asymcosine', 'asymmetric_cosine', {'alpha': alpha}),
        ('jaccard', 'jaccard', {}),
        ('tversky', 'tversky', {'alpha': alpha, 'beta': beta}),
        ('p3alpha', 'p3alpha', {'alpha': alpha}),
        ('rp3beta', 'rp3beta', {'alpha': alpha, 'beta': beta}),
        ('splus', 's_plus', {}),
        ('dice', 'dice', {}),
    )

    for metric, routine, extra in routes:
        if sim_type == metric:
            return getattr(sim, routine)(transposed, **shared, **extra)

    print('Error wrong distance metric')
def fit(self, matrix, k, distance, shrink=0, threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None, verbose=False):
    """
    Initialize the model and compute the similarity matrix S with a distance metric.
    Access the similarity matrix using: self._sim_matrix

    Parameters
    ----------
    matrix : csr_matrix
        A sparse matrix. For example, it can be the URM of shape (number_users, number_items).
    k : int
        K nearest neighbour to consider.
    distance : str
        One of the supported distance metrics (the ``self.SIM_*`` constants).
    shrink : float, optional
        Shrink term used in the normalization
    threshold : float, optional
        All the values under this value are cut from the final result
    implicit : bool, optional
        If true, treat the URM as implicit, otherwise consider explicit ratings (real values) in the URM
    alpha : float, optional
        Must be in [0,1] for asymmetric cosine, tversky and s_plus; must be
        provided (not None) for p3alpha and rp3beta.
    beta : float, optional
        Must be in [0,1] for tversky and s_plus; must be provided (not None)
        for rp3beta.
    l : float, optional
        Balance coefficient used in s_plus distance, included in [0,1]
    c : float, optional
        Cosine coefficient, included in [0,1]
    verbose : bool, optional
        Not used by this method.

    Returns
    -------
    The computed similarity matrix (also stored in ``self._sim_matrix``), or
    ``None`` when a parameter or the distance metric is invalid; in that
    case an error is logged and ``self._sim_matrix`` is left untouched.
    """
    # The "was it provided?" checks must run BEFORE None is replaced by the
    # -1 sentinel below; afterwards `alpha is None` can never be true.
    if distance == self.SIM_P3ALPHA and alpha is None:
        log.error('Invalid parameter alpha in p3alpha similarity')
        return None
    # rp3beta is called with both alpha and beta, so either one missing is invalid.
    if distance == self.SIM_RP3BETA and (alpha is None or beta is None):
        log.error('Invalid parameter alpha/beta in rp3beta similarity')
        return None

    # -1 lies outside [0, 1], so the range checks below reject omitted values.
    alpha = -1 if alpha is None else alpha
    beta = -1 if beta is None else beta
    l = -1 if l is None else l
    c = -1 if c is None else c

    if distance == self.SIM_ASYMCOSINE and not (0 <= alpha <= 1):
        log.error(
            'Invalid parameter alpha in asymmetric cosine similarity!')
        return None
    if distance == self.SIM_TVERSKY and not (0 <= alpha <= 1 and 0 <= beta <= 1):
        log.error('Invalid parameter alpha/beta in tversky similarity!')
        return None
    if distance == self.SIM_SPLUS and not (0 <= l <= 1 and 0 <= c <= 1 and 0 <= alpha <= 1 and 0 <= beta <= 1):
        log.error('Invalid parameter alpha/beta/l/c in s_plus similarity')
        return None

    # compute and stores the similarity matrix using one of the distance metric: S = R•R'
    if distance == self.SIM_COSINE:
        self._sim_matrix = sim.cosine(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
    elif distance == self.SIM_ASYMCOSINE:
        self._sim_matrix = sim.asymmetric_cosine(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha)
    elif distance == self.SIM_JACCARD:
        self._sim_matrix = sim.jaccard(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
    elif distance == self.SIM_DICE:
        self._sim_matrix = sim.dice(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit)
    elif distance == self.SIM_TVERSKY:
        self._sim_matrix = sim.tversky(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha, beta=beta)
    elif distance == self.SIM_P3ALPHA:
        self._sim_matrix = sim.p3alpha(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha)
    elif distance == self.SIM_RP3BETA:
        self._sim_matrix = sim.rp3beta(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, alpha=alpha, beta=beta)
    elif distance == self.SIM_SPLUS:
        self._sim_matrix = sim.s_plus(matrix, k=k, shrink=shrink, threshold=threshold, binary=implicit, l=l, t1=alpha, t2=beta, c=c)
    else:
        log.error('Invalid distance metric: {}'.format(distance))
        # Do not hand back a matrix left over from an earlier fit (or fail
        # on a missing attribute): behave like the other error paths.
        return None

    return self._sim_matrix
def fit(self):
    """Compute and store the similarity matrix for the configured distance.

    Reads ``self.matrix``, ``self.k``, ``self.distance``, ``self.shrink``,
    ``self.threshold``, ``self.implicit``, ``self.alpha``, ``self.beta``,
    ``self.l`` and ``self.c``, and writes the result to ``self._sim_matrix``.
    For the s_plus distance the result is additionally passed through
    ``prep.normalize(..., norm='l2', axis=0)``.

    Side effects
    ------------
    Once the presence checks pass, any of ``self.alpha``, ``self.beta``,
    ``self.l`` and ``self.c`` that is ``None`` is overwritten with ``-1``.

    Returns
    -------
    The computed similarity matrix, or ``None`` when a parameter or the
    distance metric is invalid; in that case an error is logged and
    ``self._sim_matrix`` is left untouched.
    """
    # The "was it provided?" checks must run BEFORE None is replaced by the
    # -1 sentinel below; afterwards `self.alpha is None` can never be true.
    if self.distance == self.SIM_P3ALPHA and self.alpha is None:
        log.error('Invalid parameter alpha in p3alpha Similarity_MFD')
        return None
    # rp3beta is called with both alpha and beta, so either one missing is invalid.
    if self.distance == self.SIM_RP3BETA and (self.alpha is None or self.beta is None):
        log.error('Invalid parameter alpha/beta in rp3beta Similarity_MFD')
        return None

    # -1 lies outside [0, 1], so the range checks below reject omitted values.
    self.alpha = -1 if self.alpha is None else self.alpha
    self.beta = -1 if self.beta is None else self.beta
    self.l = -1 if self.l is None else self.l
    self.c = -1 if self.c is None else self.c

    if self.distance == self.SIM_ASYMCOSINE and not (0 <= self.alpha <= 1):
        log.error(
            'Invalid parameter alpha in asymmetric cosine Similarity_MFD!')
        return None
    if self.distance == self.SIM_TVERSKY and not (0 <= self.alpha <= 1 and 0 <= self.beta <= 1):
        log.error(
            'Invalid parameter alpha/beta in tversky Similarity_MFD!')
        return None
    if self.distance == self.SIM_SPLUS and not (
            0 <= self.l <= 1 and 0 <= self.c <= 1 and 0 <= self.alpha <= 1 and 0 <= self.beta <= 1):
        log.error(
            'Invalid parameter alpha/beta/l/c in s_plus Similarity_MFD')
        return None

    # compute and stores the Similarity_MFD matrix using one of the distance metric: S = R•R'
    if self.distance == self.SIM_COSINE:
        self._sim_matrix = sim.cosine(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                      binary=self.implicit)
    elif self.distance == self.SIM_ASYMCOSINE:
        self._sim_matrix = sim.asymmetric_cosine(self.matrix, k=self.k, shrink=self.shrink,
                                                 threshold=self.threshold, binary=self.implicit,
                                                 alpha=self.alpha)
    elif self.distance == self.SIM_JACCARD:
        self._sim_matrix = sim.jaccard(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit)
    elif self.distance == self.SIM_DICE:
        self._sim_matrix = sim.dice(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                    binary=self.implicit)
    elif self.distance == self.SIM_TVERSKY:
        self._sim_matrix = sim.tversky(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit, alpha=self.alpha, beta=self.beta)
    elif self.distance == self.SIM_P3ALPHA:
        self._sim_matrix = sim.p3alpha(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit, alpha=self.alpha)
    elif self.distance == self.SIM_RP3BETA:
        self._sim_matrix = sim.rp3beta(self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold,
                                       binary=self.implicit, alpha=self.alpha, beta=self.beta)
    elif self.distance == self.SIM_SPLUS:
        self._sim_matrix = prep.normalize(sim.s_plus(
            self.matrix, k=self.k, shrink=self.shrink, threshold=self.threshold, binary=self.implicit,
            l=self.l, t1=self.alpha, t2=self.beta, c=self.c), norm='l2', axis=0)
    else:
        log.error('Invalid distance metric: {}'.format(self.distance))
        # Do not hand back a matrix left over from an earlier fit (or fail
        # on a missing attribute): behave like the other error paths.
        return None

    return self._sim_matrix