Beispiel #1
0
    def name_dupe_similarity(cls, a1_name, a2_name, word_index, languages=None,
                             likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
                             needs_review_threshold=DedupeResponse.default_name_review_threshold):
        """Compare two raw names and classify them as duplicates or not.

        Both names are tokenized with ``Name.content_tokens``, each token list
        is scored against ``word_index`` via ``cls.word_scores_normalized``,
        and the final decision is delegated to ``is_name_duplicate_fuzzy``.

        :param a1_name: first name to compare.
        :param a2_name: second name to compare.
        :param word_index: index handed to ``cls.word_scores_normalized``.
        :param languages: forwarded to both the tokenizer and the fuzzy
            comparison; defaults to ``None``.
        :param likely_dupe_threshold: forwarded unchanged to
            ``is_name_duplicate_fuzzy``.
        :param needs_review_threshold: forwarded unchanged to
            ``is_name_duplicate_fuzzy``.
        :return: ``(None, 0.0)`` when either name produces no content tokens,
            otherwise whatever ``is_name_duplicate_fuzzy`` returns
            (presumably a pair of the same shape -- TODO confirm).
        """
        tokens_a = Name.content_tokens(a1_name, languages=languages)
        tokens_b = Name.content_tokens(a2_name, languages=languages)

        # Nothing to compare unless both names contributed at least one token.
        if not (tokens_a and tokens_b):
            return None, 0.0

        # Positional arguments are evaluated left to right, so the first
        # name is still scored before the second one.
        return is_name_duplicate_fuzzy(
            tokens_a, cls.word_scores_normalized(tokens_a, word_index),
            tokens_b, cls.word_scores_normalized(tokens_b, word_index),
            languages=languages,
            likely_dupe_threshold=likely_dupe_threshold,
            needs_review_threshold=needs_review_threshold,
        )
Beispiel #2
0
    def name_dupe_fuzzy(
            cls,
            a1_name_tokens,
            a1_scores_norm,
            a2_name_tokens,
            a2_scores_norm,
            languages=None,
            likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
            needs_review_threshold=DedupeResponse.default_name_review_threshold
    ):
        """Run the fuzzy duplicate check on already tokenized and scored names.

        This is a guard around ``is_name_duplicate_fuzzy``: every argument is
        passed through unchanged, but the call is skipped when either token
        sequence is empty (or otherwise falsy).

        :param a1_name_tokens: tokens of the first name.
        :param a1_scores_norm: scores accompanying ``a1_name_tokens``.
        :param a2_name_tokens: tokens of the second name.
        :param a2_scores_norm: scores accompanying ``a2_name_tokens``.
        :param languages: forwarded to ``is_name_duplicate_fuzzy``.
        :param likely_dupe_threshold: forwarded to ``is_name_duplicate_fuzzy``.
        :param needs_review_threshold: forwarded to ``is_name_duplicate_fuzzy``.
        :return: the result of ``is_name_duplicate_fuzzy`` when both token
            sequences are non-empty, otherwise ``(None, 0.0)``.
        """
        if a1_name_tokens and a2_name_tokens:
            return is_name_duplicate_fuzzy(
                a1_name_tokens,
                a1_scores_norm,
                a2_name_tokens,
                a2_scores_norm,
                languages=languages,
                likely_dupe_threshold=likely_dupe_threshold,
                needs_review_threshold=needs_review_threshold,
            )

        # At least one side has no tokens, so there is nothing to compare.
        return None, 0.0
Beispiel #3
0
    def name_dupe_similarity(cls, a1_name, a2_name, tfidf, languages=None,
                             likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
                             needs_review_threshold=DedupeResponse.default_name_review_threshold):
        """Compare two raw names using normalized TF-IDF token weights.

        Both names are tokenized with ``Name.content_tokens``, weighted with
        ``cls.tfidf_vector_normalized``, and then handed to
        ``is_name_duplicate_fuzzy`` for the actual decision.

        :param a1_name: first name to compare.
        :param a2_name: second name to compare.
        :param tfidf: object handed to ``cls.tfidf_vector_normalized``.
        :param languages: forwarded to ``is_name_duplicate_fuzzy`` only.
        :param likely_dupe_threshold: forwarded unchanged to
            ``is_name_duplicate_fuzzy``.
        :param needs_review_threshold: forwarded unchanged to
            ``is_name_duplicate_fuzzy``.
        :return: ``(None, 0.0)`` when either name produces no content tokens,
            otherwise whatever ``is_name_duplicate_fuzzy`` returns
            (presumably a pair of the same shape -- TODO confirm).
        """
        # NOTE(review): ``languages`` is not passed to the tokenizer here, only
        # to the fuzzy comparison below -- confirm that this is intended.
        tokens_a = Name.content_tokens(a1_name)
        tokens_b = Name.content_tokens(a2_name)

        # Nothing to compare unless both names contributed at least one token.
        if not (tokens_a and tokens_b):
            return None, 0.0

        # ``zip(*items)`` transposes the 2-element items returned by
        # ``tfidf_vector_normalized`` into two parallel tuples; the first one
        # replaces the token list obtained above.
        tokens_a, weights_a = zip(*cls.tfidf_vector_normalized(tokens_a, tfidf))
        tokens_b, weights_b = zip(*cls.tfidf_vector_normalized(tokens_b, tfidf))

        return is_name_duplicate_fuzzy(
            tokens_a, weights_a,
            tokens_b, weights_b,
            languages=languages,
            likely_dupe_threshold=likely_dupe_threshold,
            needs_review_threshold=needs_review_threshold,
        )