def compression_dist(x, y, l_x=None, l_y=None):
    """Return a normalized LZMA compression distance between two strings.

    Both strings are UTF-8 encoded and compressed with ``lzma``. With ``C(s)``
    denoting the compressed length in bytes, the value handed to
    ``try_divide`` is::

        (min(C(x + y), C(y + x)) - min(C(x), C(y))) / max(C(x), C(y))

    Args:
        x: First string.
        y: Second string.
        l_x: Optional precomputed compressed length of ``x``; computed here
            when ``None``.
        l_y: Optional precomputed compressed length of ``y``; computed here
            when ``None``.

    Returns:
        ``0`` when ``x == y``; otherwise the result of ``try_divide`` applied
        to the numerator and denominator above.
    """
    if x == y:
        return 0

    x_b = x.encode('utf-8')
    y_b = y.encode('utf-8')

    # Fill in each missing length on its own. Keying both on ``l_x`` alone
    # would leave ``l_y`` as None when only ``l_x`` is supplied (TypeError in
    # min/max below) and would discard a supplied ``l_y`` when ``l_x`` is not.
    if l_x is None:
        l_x = len(lzma.compress(x_b))
    if l_y is None:
        l_y = len(lzma.compress(y_b))

    # Compression is order-sensitive, so take the better of both orderings.
    l_xy = len(lzma.compress(x_b + y_b))
    l_yx = len(lzma.compress(y_b + x_b))

    return try_divide(min(l_xy, l_yx) - min(l_x, l_y), max(l_x, l_y))
def lzma_ratio(a, b):
    """Return an LZMA-compression-based similarity score for two strings.

    The strings are UTF-8 encoded and compressed individually and in both
    concatenation orders. The score is ``1 - d`` where ``d`` is
    ``(min(C(ab), C(ba)) - min(C(a), C(b))) / max(C(a), C(b))`` and ``C`` is
    the compressed length in bytes; the division goes through ``try_divide``.

    Args:
        a: First string.
        b: Second string.

    Returns:
        ``1`` when the inputs are equal; otherwise ``1`` minus the value
        returned by ``try_divide``.
    """
    if a == b:
        return 1

    def packed_size(data):
        # Length in bytes of the LZMA-compressed payload.
        return len(lzma.compress(data))

    raw_a = a.encode('utf-8')
    raw_b = b.encode('utf-8')

    size_a = packed_size(raw_a)
    size_b = packed_size(raw_b)
    size_ab = packed_size(raw_a + raw_b)
    size_ba = packed_size(raw_b + raw_a)

    best_joined = min(size_ab, size_ba)
    smaller, larger = sorted((size_a, size_b))
    return 1 - try_divide(best_joined - smaller, larger)
def cosine_distance(a, b):
    """Return the dot product of ``a`` and ``b`` over the product of their norms.

    NOTE(review): despite the name, the expression computed here is the
    cosine *similarity* (``dot / (|a| * |b|)``), not ``1 - similarity``.
    Confirm that callers expect this.

    The division is delegated to ``try_divide`` (defined elsewhere), and the
    result is passed through ``np.nan_to_num`` so NaN/inf values are replaced
    by finite numbers.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector.

    Returns:
        The ``np.nan_to_num``-sanitized ratio.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    similarity = try_divide(dot_product, norm_product)
    return np.nan_to_num(similarity)
def dice_ratio(a, b):
    """Return the Dice coefficient of the distinct elements of two iterables.

    Both inputs are converted to sets; the ratio is
    ``2 * |A & B| / (|A| + |B|)``, with the division done by ``try_divide``.

    Args:
        a: First iterable of hashable items.
        b: Second iterable of hashable items.

    Returns:
        The value returned by ``try_divide`` for the ratio above.
    """
    items_a = set(a)
    items_b = set(b)
    shared = items_a.intersection(items_b)
    total_size = len(items_a) + len(items_b)
    return try_divide(2 * len(shared), total_size)
def jaccard_ratio(a, b):
    """Return the Jaccard index of the distinct elements of two iterables.

    Both inputs are converted to sets; the ratio is ``|A & B| / |A | B|``,
    with the division done by ``try_divide``.

    Args:
        a: First iterable of hashable items.
        b: Second iterable of hashable items.

    Returns:
        The value returned by ``try_divide`` for the ratio above.
    """
    left = set(a)
    right = set(b)
    overlap_size = len(left.intersection(right))
    # |A| + |B| - |A & B| is exactly the size of the union.
    union_size = len(left.union(right))
    return try_divide(overlap_size, union_size)
def longest_match_ratio(str1, str2):
    """Return the longest common block length relative to the shorter input.

    A ``SequenceMatcher`` is built with a junk predicate that flags the
    single-space character, and the longest matching block over the full
    extent of both sequences is located. Its size is divided (via
    ``try_divide``) by the length of the shorter sequence.

    Args:
        str1: First sequence (typically a string).
        str2: Second sequence.

    Returns:
        The value returned by ``try_divide`` for
        ``longest_block_size / min(len(str1), len(str2))``.
    """
    def is_space(ch):
        return ch == " "

    len1, len2 = len(str1), len(str2)
    matcher = SequenceMatcher(isjunk=is_space, a=str1, b=str2)
    longest = matcher.find_longest_match(0, len1, 0, len2)
    return try_divide(longest.size, min(len1, len2))