def match(self, queryWords, docWords):
    """Return the cosine similarity between a query and a document.

    The vector dimensions are the words returned by
    ``self.sharedWords(queryWords, docWords)``; per-word frequencies come
    from ``self.wordVector`` and inverse document frequencies from
    ``self.invDocFreq``.

    Weighting:
      * document: ``freq / maxFreq * idf``
      * query:    ``(0.5 + 0.5 * freq / maxFreq) * idf``

    Args:
        queryWords: word frequencies of the query (presumably a
            word -> count mapping; confirm against callers).
        docWords: word frequencies of the document (same shape as
            ``queryWords``).

    Returns:
        ``queryVec.dot(docVec) / (|queryVec| * |docVec|)``, or ``0.0`` when
        that ratio is undefined: no words to compare, a document whose
        largest frequency is zero, or either weighted vector having zero
        length.
    """
    words = self.sharedWords(queryWords, docWords)
    idf = [self.invDocFreq(w) for w in words]
    if not idf:
        # Nothing to compare; avoid asking an empty vector for its maximum.
        return 0.0

    docVec = self.wordVector(words, docWords)
    docMax = docVec.largest()
    if docMax == 0:
        # Every document weight would be 0/0; the query side already guards
        # this case, so treat it the same way here instead of raising.
        return 0.0
    docVec = Vector([docVec[i] / docMax * weight
                     for i, weight in enumerate(idf)])

    queryVec = self.wordVector(words, queryWords)
    queryMax = queryVec.largest()
    queryWeights = []
    for i, weight in enumerate(idf):
        if weight == 0:
            # A zero idf zeroes the term regardless of its frequency.
            queryWeights.append(0)
            continue
        scaled = 0 if queryMax == 0 else 0.5 * queryVec[i] / queryMax
        queryWeights.append((0.5 + scaled) * weight)
    queryVec = Vector(queryWeights)

    norm = queryVec.length() * docVec.length()
    if norm == 0:
        # A zero-length vector has no direction, so similarity is 0.
        return 0.0
    return queryVec.dot(docVec) / norm
def testCosineSimilarity(self):
    """Compare CosineSimilarity.match with hand-computed cosine scores.

    Each case lists the document counts, the query counts, and the expected
    weight vectors over the words 'a', 'b', 'c' (in that order).
    """
    corpus = {
        'docFreq': {
            'a': 4,
            'b': 2,
            'c': 1,
        },
        'docCount': 16
    }
    matcher = matching.CosineSimilarity(corpus)

    idf = {}
    for w in ['a', 'b', 'c']:
        idf[w] = matcher.invDocFreq(w)

    cases = [
        # test one: 'b' missing from the document, 'c' missing from the query
        (
            {'a': 13, 'c': 9},
            {'a': 7, 'b': 4},
            [13/13*idf['a'], 0, 9/13*idf['c']],
            [(0.5+0.5*7/7)*idf['a'], (0.5+0.5*4/7)*idf['b'], 0.5*idf['c']],
        ),
        # test two: document has all three words, 'c' missing from the query
        (
            {'a': 13, 'b': 9, 'c': 1},
            {'a': 7, 'b': 4},
            [13/13*idf['a'], 9/13*idf['b'], 1/13*idf['c']],
            [(0.5+0.5*7/7)*idf['a'], (0.5+0.5*4/7)*idf['b'], 0.5*idf['c']],
        ),
        # test three: 'c' missing from the document, query has all three
        (
            {'a': 13, 'b': 9},
            {'a': 7, 'b': 4, 'c': 1},
            [13/13*idf['a'], 9/13*idf['b'], 0],
            [(0.5+0.5*7/7)*idf['a'], (0.5+0.5*4/7)*idf['b'],
             (0.5+0.5*1/7)*idf['c']],
        ),
    ]

    for docWords, queryWords, docWeights, queryWeights in cases:
        docVec = Vector(docWeights)
        queryVec = Vector(queryWeights)
        # Dot product written out term by term, then divided by each
        # vector's length in turn.
        dotProduct = docVec[0]*queryVec[0] + docVec[1]*queryVec[1] + \
            docVec[2]*queryVec[2]
        expected = dotProduct / docVec.length() / queryVec.length()
        self.assertApproxEqual(matcher.match(queryWords, docWords), expected)
def test_length(self):
    """Vector([3, 4]).length() is expected to equal 5 (3-4-5 triangle)."""
    self.assertEqual(Vector([3, 4]).length(), 5)