def test_dot(self):
    """Exercise ``dot`` on sparse and dense vectors.

    Each vector is multiplied with another vector, with a 4x4 matrix
    whose rows are all ``[1, 2, 3, 4]``, and (for the sparse vector)
    with a typed ``array.array`` of doubles.
    """
    row = [1., 2., 3., 4.]

    sparse = SparseVector(4, {1: 1, 3: 2})
    dense = DenseVector(array(row))
    dense_from_list = DenseVector([1, 2, 3, 4])
    matrix = array([row, row, row, row])
    typed_doubles = pyarray.array('d', [0, 1, 2, 3])

    # Sparse vector against a dense vector and against the matrix.
    self.assertEqual(10.0, sparse.dot(dense))
    sparse_times_matrix = sparse.dot(matrix)
    self.assertTrue(array_equal(array([3., 6., 9., 12.]), sparse_times_matrix))

    # Dense vector built from a NumPy array.
    self.assertEqual(30.0, dense.dot(dense))
    dense_times_matrix = dense.dot(matrix)
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), dense_times_matrix))

    # Dense vector built from a plain list must behave the same way.
    self.assertEqual(30.0, dense_from_list.dot(dense))
    list_times_matrix = dense_from_list.dot(matrix)
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), list_times_matrix))

    # Sparse vector against a stdlib typed array.
    self.assertEqual(7.0, sparse.dot(typed_doubles))
def cosine_similarity(rates1: SparseVector, rates2: SparseVector) -> Decimal:
    """Compute the cosine similarity of two rating vectors.

    See https://en.wikipedia.org/wiki/Cosine_similarity

    Args:
        rates1: pyspark.ml.linalg.SparseVector of rate values for one user.
        rates2: pyspark.ml.linalg.SparseVector of rate values for another user.

    Returns:
        decimal.Decimal: ``dot(rates1, rates2) / (|rates1| * |rates2|)``.
        When either vector has zero norm the ratio is undefined; in that
        case ``Decimal(0)`` is returned instead of raising a decimal
        arithmetic error.
    """
    dot_rates = Decimal(rates1.dot(rates2))
    sq_rates1 = Decimal(np.sum(np.square(rates1.values)))
    sq_rates2 = Decimal(np.sum(np.square(rates2.values)))

    # Decimal has its own sqrt(); calling it directly avoids routing a
    # Decimal through NumPy's object-dtype fallback.
    norm_product = sq_rates1.sqrt() * sq_rates2.sqrt()

    # A vector with no non-zero entries has zero norm; dividing by it would
    # raise decimal.InvalidOperation / decimal.DivisionByZero.
    if norm_product == 0:
        return Decimal(0)

    return dot_rates / norm_product