def dist_txt(self, x, y):
    """
    Calculates the distance between 2 given texts.

    The distance is the Jaccard distance between the sets of words of
    both texts: 1 - |set(x) & set(y)| / |set(x) | set(y)|.

    :param x: first text; it is split into words with text_to_words.
    :param y: second text; it is split into words with text_to_words.
    :return: 0.0 when both texts have the same set of words, 1.0 when
        they share no word. Two texts without any word are treated as
        identical (0.0).
    """
    x_s = set(text_to_words(x))
    y_s = set(text_to_words(y))
    union = x_s | y_s
    if not union:
        # Neither text produced a word: nothing differs, and dividing by
        # len(union) would raise ZeroDivisionError.
        return 0.0
    # The ratio alone is a similarity; subtract it from 1 so the result
    # is a distance, as the name and the documented formula state.
    return 1 - len(x_s & y_s) / len(union)
def dist_txt(self, x, y):
    """
    Calculates the distance between 2 given texts.

    The distance is the Jaccard distance between the sets of words of
    both texts: 1 - |set(x) & set(y)| / |set(x) | set(y)|.

    :param x: first text; it is split into words with text_to_words.
    :param y: second text; it is split into words with text_to_words.
    :return: 0.0 when both texts have the same set of words, 1.0 when
        they share no word. Two texts without any word are treated as
        identical (0.0).
    """
    x_s = set(text_to_words(x))
    y_s = set(text_to_words(y))
    all_words = x_s | y_s
    if not all_words:
        # Empty union: there is nothing to compare and the division
        # below would raise ZeroDivisionError.
        return 0.0
    similarity = len(x_s & y_s) / len(all_words)
    # Convert the similarity ratio into the documented distance.
    return 1 - similarity
def dist_txt(self, x, y):
    """
    Calculates the distance between 2 given texts.

    It is similar to 1 - (set(x)&set(y) / set(x)|set(y)), but works on
    multisets of words, so repeated words count as well. Words found in
    self.stopwords are ignored. The result is the number of word
    occurrences that have no counterpart in the other text, divided by
    the total number of (non-stopword) word occurrences in both texts.

    :param x: first text; it is split into words with text_to_words.
    :param y: second text; it is split into words with text_to_words.
    :return: 0.0 when both texts contain the same words the same number
        of times, 1.0 when they share no word. When neither text has
        any non-stopword word, 0.0 is returned.
    """
    # Function-scope import keeps this method self-contained.
    from collections import Counter

    stopwords = self.stopwords
    x_counts = Counter(w for w in text_to_words(x) if w not in stopwords)
    y_counts = Counter(w for w in text_to_words(y) if w not in stopwords)

    total = sum(x_counts.values()) + sum(y_counts.values())
    if not total:
        # Nothing left to compare after filtering; avoid dividing by zero.
        return 0.0

    # Counter subtraction keeps only positive counts, so each term is the
    # number of occurrences present on one side but unmatched on the other.
    unmatched = (sum((x_counts - y_counts).values())
                 + sum((y_counts - x_counts).values()))
    return unmatched / total