def compute_ratings(self, token_vectors):
    """Return a dict mapping each word to its tf * idf rating.

    Term frequencies are taken from ``self.compute_tf`` and inverse
    document frequencies from ``analysisTool.compute_idf``, both called
    with ``token_vectors``. Every word present in the tf result receives
    a rating.
    """
    term_freq = self.compute_tf(token_vectors)
    inverse_doc_freq = analysisTool.compute_idf(token_vectors)
    return {
        term: term_freq[term] * inverse_doc_freq[term]
        for term in term_freq
    }
def test_commpute_idf(self):
    """Check that idf is computed for every word of the given sentences.

    Each word of the two sample sentences must appear in the result, and
    "this" and "is", which occur in both sentences, must have an idf of 0.
    """
    sentences = [
        ["this", "is", "a", "sample"],
        ["this", "is", "another", "example"],
    ]
    idf = analysisTool.compute_idf(sentences)

    for word in ("this", "is", "a", "sample", "another", "example"):
        self.assertIn(word, idf.keys())

    self.assertEqual(0, idf["this"])
    self.assertEqual(0, idf["is"])
def compute_cosine(self, token_vectors, treshold):
    """Return a binary similarity matrix for the given sentences.

    The result is a square ``numpy`` array of size ``len(token_vectors)``.
    Entry ``[i][j]`` is 1.0 when the idf-weighted cosine similarity of
    sentences ``i`` and ``j`` is strictly greater than ``treshold`` and
    0.0 otherwise. The diagonal is always 1.0 and the matrix is symmetric.

    Args:
        token_vectors: sequence of sentences, each a sequence of word
            tokens.
        treshold: similarity value a pair of sentences must exceed in
            order to be marked as similar.

    Returns:
        An ``(n, n)`` array containing only zeros and ones.
    """
    n = len(token_vectors)
    tf = [analysisTool.compute_tf(v) for v in token_vectors]
    idf = analysisTool.compute_idf(token_vectors)

    # Distinct words of every sentence, built once instead of once per pair.
    unique_words = [set(v) for v in token_vectors]

    cosine_matrix = numpy.zeros((n, n))

    # ``range`` rather than ``xrange`` so the method runs on Python 2 and 3.
    for row in range(n):
        # diagonal values set to 1
        cosine_matrix[row][row] = 1

        # sentences with only few words are skipped since
        # they tend to get higher scores
        if len(token_vectors[row]) < 3:
            continue
        words1 = unique_words[row]

        for col in range(row + 1, n):
            if len(token_vectors[col]) < 3:
                continue
            words2 = unique_words[col]

            # no common words means numerator = 0
            common_words = words1 & words2
            if not common_words:
                continue

            numerator = sum(
                tf[row][word] * tf[col][word] * pow(idf[word], 2)
                for word in common_words
            )
            # A non-positive numerator can never exceed the threshold test
            # below; skipping here also avoids dividing by a zero norm.
            if not numerator > 0:
                continue

            # Vector norms are taken over DISTINCT words. Summing over the
            # raw token list would count a repeated word once per
            # occurrence and inflate the denominator, so the value would
            # no longer be a cosine (identical sentences would score < 1).
            d1 = sum(pow(tf[row][word] * idf[word], 2) for word in words1)
            d2 = sum(pow(tf[col][word] * idf[word], 2) for word in words2)
            denominator = math.sqrt(d1) * math.sqrt(d2)

            if numerator / denominator > treshold:
                cosine_matrix[row][col] = 1.0
                cosine_matrix[col][row] = 1.0

    return cosine_matrix