def __init__(self, definition):
    """Set up the field and attach a cosine text comparator.

    The parent initialiser is run first with the same ``definition``.
    If ``definition`` has no ``'corpus'`` entry, an empty list is stored
    under that key (the caller's mapping is modified in place), and the
    resulting corpus is used to build ``self.comparator``.
    """
    super(TextType, self).__init__(definition)

    # setdefault writes the empty list back into ``definition`` when the
    # key is missing, so the mapping always carries a 'corpus' afterwards.
    corpus = definition.setdefault('corpus', [])
    self.comparator = CosineTextSimilarity(corpus)
def dedupe_cosine(s1, s2):
    """Compute the pairwise cosine text similarity of two aligned sequences.

    ``s1`` and ``s2`` are zipped element by element; every value from both
    sides is collected into one corpus that is handed to
    ``CosineTextSimilarity``, and the resulting comparator is then applied
    to each pair.

    Parameters
    ----------
    s1, s2 : iterable
        Aligned iterables of texts to compare (zipped, so the result is as
        long as the shorter one).

    Returns
    -------
    pandas.Series
        One value per pair, as returned by the comparator.
    """
    pairs = pd.Series(list(zip(s1, s2)))

    # Build the corpus in pair order (left, right, left, right, ...).
    # Iterating the Series yields its values directly, which avoids
    # ``Series.iteritems()`` (removed in pandas 2.0).
    corpus = [text for pair in pairs for text in pair]

    # Initialise the comparator on the full corpus.
    cosine = CosineTextSimilarity(corpus)

    # Score each pair.
    return pairs.apply(lambda pair: cosine(pair[0], pair[1]))
def test_cosine_na(self):
    """Comparing a document against an empty string should yield NaN."""
    cosine = CosineTextSimilarity(self.ilist)
    cosine_sim = cosine(self.ilist[0], '')
    # Use a unittest assertion rather than a bare ``assert`` so the check
    # is not stripped when Python runs with -O, and to match the
    # self.assert* style of the neighbouring tests.
    self.assertTrue(numpy.isnan(cosine_sim))
def test_cosine_identical(self):
    """A document compared with itself should score 1 (to 5 places)."""
    comparator = CosineTextSimilarity(self.ilist)
    score = comparator(self.ilist[0], self.ilist[0])
    self.assertAlmostEqual(score, 1, places=5)
def test_cosine(self):
    """The first two fixture documents should score 0.378 (to 3 places)."""
    comparator = CosineTextSimilarity(self.ilist)
    first, second = self.ilist[0], self.ilist[1]
    self.assertAlmostEqual(comparator(first, second), 0.378, places=3)