def compute_tf(self, token_vectors):
    """Accumulate the term frequencies of every sentence into one counter.

    Each element of ``token_vectors`` is handed to
    ``analysisTool.compute_tf`` and the per-sentence results are added
    together in a single ``collections.Counter``, which is returned.
    """
    total = collections.Counter()
    for sentence_tokens in token_vectors:
        sentence_tf = analysisTool.compute_tf(sentence_tokens)
        total += sentence_tf
    return total
def test_compute_tf(self):
    """compute_tf should count how often each token occurs in a sentence."""
    tokens = ["never", "stop", "stop", "sign"]
    want = {"never": 1, "stop": 2, "sign": 1}

    got = analysisTool.compute_tf(tokens)

    self.assertDictEqual(want, got)
def compute_cosine(self, token_vectors, treshold):
    """Build a thresholded sentence-similarity matrix.

    For every pair of sentences the idf-modified cosine similarity of
    their tf-idf vectors is computed; the pair is marked as similar when
    that similarity is strictly greater than ``treshold``.

    Args:
        token_vectors: sequence of sentences, each a sequence of tokens.
        treshold: similarity cut-off (parameter name kept as-is for
            backward compatibility with existing callers).

    Returns:
        A square ``numpy`` array of size ``len(token_vectors)``. The
        diagonal is 1, a cell is 1.0 when the two sentences' similarity
        exceeds ``treshold``, and 0 otherwise. The matrix is symmetric.
    """
    n = len(token_vectors)
    tf = [analysisTool.compute_tf(v) for v in token_vectors]
    idf = analysisTool.compute_idf(token_vectors)

    # Distinct words of each sentence, built once instead of once per pair.
    vocab = [set(v) for v in token_vectors]

    cosine_matrix = numpy.zeros((n, n))
    for row in range(n):
        # diagonal values are always set to 1
        cosine_matrix[row][row] = 1

        for col in range(row + 1, n):
            # sentences with only few words are skipped since
            # they tend to get higher scores
            if len(token_vectors[row]) < 3 or len(token_vectors[col]) < 3:
                continue

            common_words = vocab[row] & vocab[col]
            # no common words means numerator = 0
            if not common_words:
                continue

            numerator = sum(
                tf[row][word] * tf[col][word] * idf[word] ** 2
                for word in common_words
            )

            # The vector norms must visit each distinct word once.
            # Iterating the raw token list would add a repeated word's
            # squared weight once per occurrence, inflating the
            # denominator so that even identical sentences score < 1.
            d1 = sum((tf[row][word] * idf[word]) ** 2 for word in vocab[row])
            d2 = sum((tf[col][word] * idf[word]) ** 2 for word in vocab[col])
            denominator = math.sqrt(d1) * math.sqrt(d2)

            # numerator > 0 implies both norms are non-zero, so the
            # division below cannot hit a zero denominator.
            if numerator > 0 and numerator / denominator > treshold:
                cosine_matrix[row][col] = 1.0
                cosine_matrix[col][row] = 1.0

    return cosine_matrix