def match(self, queryWords, docWords):
        """Score a document against a query with tf-idf cosine similarity.

        Both inputs are turned into weight vectors over the words returned
        by ``self.sharedWords(queryWords, docWords)``:

        * document weight: ``freq / maxDocFreq * idf``
        * query weight: ``(0.5 + 0.5 * freq / maxQueryFreq) * idf``,
          forced to 0 when the word's idf is 0

        Args:
            queryWords: word data for the query, handed to
                ``self.sharedWords`` and ``self.wordVector``
                (presumably a mapping of word -> occurrence count;
                confirm against the callers).
            docWords: word data for the document, used the same way.

        Returns:
            The dot product of the two weight vectors divided by the
            product of their lengths. Returns 0.0 when that ratio is
            undefined: there are no words to compare, or at least one of
            the weight vectors has zero length.
        """
        words = self.sharedWords(queryWords, docWords)
        if len(words) == 0:
            # Nothing to compare; report "no similarity" rather than
            # dividing zero by zero further down.
            return 0.0
        idf = [self.invDocFreq(w) for w in words]

        docVec = self.wordVector(words, docWords)
        maxFreq = docVec.largest()
        # Mirror the query-side guard: when no word occurs in the document
        # maxFreq is 0, so use all-zero weights instead of dividing by it.
        docVec = Vector([0 if maxFreq == 0 else docVec[i] / maxFreq * weight
                for i, weight in enumerate(idf)])

        queryVec = self.wordVector(words, queryWords)
        maxFreq = queryVec.largest()
        # Every word gets a 0.5 base weight, plus up to another 0.5 scaled
        # by its frequency relative to the query's most frequent word.
        queryVec = Vector([0 if weight == 0 else
                (0.5 + (0 if maxFreq == 0 else 0.5*queryVec[i]/maxFreq))
                * weight for i, weight in enumerate(idf)])

        norm = queryVec.length() * docVec.length()
        if norm == 0:
            # A zero-length vector has no direction, so the cosine is
            # undefined; treat it as no match.
            return 0.0
        return queryVec.dot(docVec) / norm
    def testCosineSimilarity(self):
        # Compares CosineSimilarity.match with scores worked out by hand
        # for three document/query combinations over the words a, b, c.
        matcher = matching.CosineSimilarity({
            'docFreq': {'a': 4, 'b': 2, 'c': 1},
            'docCount': 16,
        })
        idf = {w: matcher.invDocFreq(w) for w in ['a', 'b', 'c']}

        # The query counts for 'a' and 'b' are 7 and 4 in every case, so
        # their expected weights are shared by all three scenarios.
        queryA = (0.5+0.5*7/7)*idf['a']
        queryB = (0.5+0.5*4/7)*idf['b']

        # Each case: (docWords, queryWords, expected document weights,
        # expected query weights), weights listed in a, b, c order.
        cases = [
            # one: 'b' absent from the document, 'c' absent from the query
            ({'a': 13, 'c': 9}, {'a': 7, 'b': 4},
             [13/13*idf['a'], 0, 9/13*idf['c']],
             [queryA, queryB, 0.5*idf['c']]),
            # two: document holds every word, 'c' absent from the query
            ({'a': 13, 'b': 9, 'c': 1}, {'a': 7, 'b': 4},
             [13/13*idf['a'], 9/13*idf['b'], 1/13*idf['c']],
             [queryA, queryB, 0.5*idf['c']]),
            # three: query holds every word, 'c' absent from the document
            ({'a': 13, 'b': 9}, {'a': 7, 'b': 4, 'c': 1},
             [13/13*idf['a'], 9/13*idf['b'], 0],
             [queryA, queryB, (0.5+0.5*1/7)*idf['c']]),
        ]

        for docWords, queryWords, docWeights, queryWeights in cases:
            docVec = Vector(docWeights)
            queryVec = Vector(queryWeights)

            # Expected score: dot product divided by both vector lengths.
            dot = sum(docVec[k]*queryVec[k] for k in range(3))
            expected = dot / docVec.length() / queryVec.length()

            self.assertApproxEqual(matcher.match(queryWords, docWords),
                    expected)
 def test_length(self):
   # 3-4-5 triangle: the vector (3, 4) must have a length of exactly 5.
   self.assertEqual(Vector([3, 4]).length(), 5)