def name_dupe_similarity(
        cls, a1_name, a2_name, word_index, languages=None,
        likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
        needs_review_threshold=DedupeResponse.default_name_review_threshold):
    """Compare two raw names for fuzzy duplication using a word index.

    Both names are tokenized with ``Name.content_tokens`` (passing
    ``languages`` through), each token list is scored with
    ``cls.word_scores_normalized`` against ``word_index``, and the tokens
    and scores are handed to ``is_name_duplicate_fuzzy``.

    Args:
        a1_name: First name to compare.
        a2_name: Second name to compare.
        word_index: Index passed to ``cls.word_scores_normalized``.
        languages: Optional language hints forwarded to both the tokenizer
            and ``is_name_duplicate_fuzzy``.
        likely_dupe_threshold: Forwarded unchanged to
            ``is_name_duplicate_fuzzy``.
        needs_review_threshold: Forwarded unchanged to
            ``is_name_duplicate_fuzzy``.

    Returns:
        ``(None, 0.0)`` when either name produces no content tokens;
        otherwise whatever ``is_name_duplicate_fuzzy`` returns
        (presumably a ``(dupe_class, similarity)`` pair -- confirm against
        that function's documentation).
    """
    first_tokens = Name.content_tokens(a1_name, languages=languages)
    second_tokens = Name.content_tokens(a2_name, languages=languages)

    # Nothing to compare unless both names yield at least one token.
    if not (first_tokens and second_tokens):
        return None, 0.0

    first_scores = cls.word_scores_normalized(first_tokens, word_index)
    second_scores = cls.word_scores_normalized(second_tokens, word_index)

    options = {
        'languages': languages,
        'likely_dupe_threshold': likely_dupe_threshold,
        'needs_review_threshold': needs_review_threshold,
    }
    return is_name_duplicate_fuzzy(first_tokens, first_scores,
                                   second_tokens, second_scores,
                                   **options)
def name_dupe_fuzzy(
        cls, a1_name_tokens, a1_scores_norm, a2_name_tokens, a2_scores_norm,
        languages=None,
        likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
        needs_review_threshold=DedupeResponse.default_name_review_threshold):
    """Run the fuzzy name-duplicate check on already tokenized, scored names.

    A thin wrapper around ``is_name_duplicate_fuzzy`` that first makes sure
    both token sequences are non-empty.

    Args:
        a1_name_tokens: Tokens of the first name.
        a1_scores_norm: Scores for the first name's tokens (by the
            parameter name, presumably normalized -- not checked here).
        a2_name_tokens: Tokens of the second name.
        a2_scores_norm: Scores for the second name's tokens.
        languages: Optional language hints forwarded to
            ``is_name_duplicate_fuzzy``.
        likely_dupe_threshold: Forwarded unchanged.
        needs_review_threshold: Forwarded unchanged.

    Returns:
        ``(None, 0.0)`` when either token sequence is empty; otherwise the
        result of ``is_name_duplicate_fuzzy``.
    """
    if a1_name_tokens and a2_name_tokens:
        return is_name_duplicate_fuzzy(
            a1_name_tokens, a1_scores_norm,
            a2_name_tokens, a2_scores_norm,
            languages=languages,
            likely_dupe_threshold=likely_dupe_threshold,
            needs_review_threshold=needs_review_threshold,
        )

    # At least one side has no tokens, so there is nothing to compare.
    return None, 0.0
def name_dupe_similarity(
        cls, a1_name, a2_name, tfidf, languages=None,
        likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
        needs_review_threshold=DedupeResponse.default_name_review_threshold):
    """Compare two raw names for fuzzy duplication using TF-IDF weights.

    Both names are tokenized with ``Name.content_tokens``, each token list
    is turned into ``(token, weight)`` pairs by
    ``cls.tfidf_vector_normalized``, and the separated tokens and weights
    are handed to ``is_name_duplicate_fuzzy``.

    Args:
        a1_name: First name to compare.
        a2_name: Second name to compare.
        tfidf: TF-IDF model passed to ``cls.tfidf_vector_normalized``.
        languages: Optional language hints, forwarded to both the tokenizer
            and ``is_name_duplicate_fuzzy``.
        likely_dupe_threshold: Forwarded unchanged to
            ``is_name_duplicate_fuzzy``.
        needs_review_threshold: Forwarded unchanged to
            ``is_name_duplicate_fuzzy``.

    Returns:
        ``(None, 0.0)`` when either name produces no content tokens or no
        TF-IDF pairs; otherwise whatever ``is_name_duplicate_fuzzy``
        returns (presumably a ``(dupe_class, similarity)`` pair -- confirm
        against that function's documentation).
    """
    # Tokenize with the same language hints that the comparison itself
    # receives, so both stages agree on how the names are interpreted.
    a1_name_tokens = Name.content_tokens(a1_name, languages=languages)
    a2_name_tokens = Name.content_tokens(a2_name, languages=languages)
    if not a1_name_tokens or not a2_name_tokens:
        return None, 0.0

    # Materialize the pairs so emptiness can be checked even if the helper
    # returns a lazy iterable.
    a1_pairs = list(cls.tfidf_vector_normalized(a1_name_tokens, tfidf))
    a2_pairs = list(cls.tfidf_vector_normalized(a2_name_tokens, tfidf))

    # zip(*[]) yields nothing, which would make the two-way unpacking below
    # raise ValueError; treat an empty vector like an empty token list.
    if not a1_pairs or not a2_pairs:
        return None, 0.0

    # Split [(token, weight), ...] into parallel token / weight tuples.
    a1_name_tokens, a1_tfidf_norm = zip(*a1_pairs)
    a2_name_tokens, a2_tfidf_norm = zip(*a2_pairs)

    return is_name_duplicate_fuzzy(
        a1_name_tokens, a1_tfidf_norm,
        a2_name_tokens, a2_tfidf_norm,
        languages=languages,
        likely_dupe_threshold=likely_dupe_threshold,
        needs_review_threshold=needs_review_threshold,
    )