Esempio n. 1
0
    def street_dupe_status(cls, street1, street2, languages=None, fuzzy=False):
        """Compare two street names and wrap the outcome in a ``Dupe``.

        Args:
            street1: First street name.
            street2: Second street name.
            languages: Languages handed to the comparison helpers. When
                ``None`` they are looked up for each street separately with
                ``place_languages(['road'], [street])``; the two results are
                merged by ``combined_languages`` if neither is ``None``,
                otherwise the first truthy one is used, falling back to
                ``DEFAULT_LANGUAGES``.
            fuzzy: If true and the direct comparison is neither an exact nor
                a likely duplicate, retry with ``is_street_duplicate_fuzzy``
                on uniformly weighted content tokens.

        Returns:
            A ``Dupe`` built with ``status`` and ``sim`` keyword arguments.
            When no match is found and the fuzzy path is not taken (or has no
            tokens/weights to work with), the status is ``NON_DUPLICATE`` but
            ``sim`` keeps the value derived from the direct comparison (so a
            ``NEEDS_REVIEW`` result still reports ``0.5``).
        """
        if languages is None:
            first_langs = place_languages(['road'], [street1])
            second_langs = place_languages(['road'], [street2])
            if first_langs is None or second_langs is None:
                languages = first_langs or second_langs or DEFAULT_LANGUAGES
            else:
                languages = combined_languages(first_langs, second_langs)

        direct_status = is_street_duplicate(street1, street2, languages=languages)
        is_match = direct_status in (duplicate_status.EXACT_DUPLICATE,
                                     duplicate_status.LIKELY_DUPLICATE)

        # Fixed similarity per status; anything unlisted scores 0.0.
        direct_sim = 0.0
        for known_status, known_sim in (
            (duplicate_status.EXACT_DUPLICATE, 1.0),
            (duplicate_status.NEEDS_REVIEW, 0.5),
            (duplicate_status.LIKELY_DUPLICATE, 0.9),
        ):
            if direct_status == known_status:
                direct_sim = known_sim
                break

        if is_match:
            return Dupe(status=direct_status, sim=direct_sim)

        if not fuzzy:
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=direct_sim)

        # Fuzzy fallback: every content token gets the same weight (1) before
        # normalization.
        tokens1 = Name.content_tokens(street1, languages=languages)
        weights1 = WordIndex.normalized_vector([1] * len(tokens1))
        tokens2 = Name.content_tokens(street2, languages=languages)
        weights2 = WordIndex.normalized_vector([1] * len(tokens2))

        if not (tokens1 and tokens2 and weights1 and weights2):
            # Nothing to compare fuzzily; report the direct similarity.
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=direct_sim)

        fuzzy_status, fuzzy_sim = is_street_duplicate_fuzzy(
            tokens1, weights1, tokens2, weights2, languages=languages)
        return Dupe(status=fuzzy_status, sim=fuzzy_sim)
Esempio n. 2
0
    def address_dupe_status(cls, a1, a2, languages=None, fuzzy_street_name=False):
        """Decide whether two address records are duplicates by street and house number.

        Args:
            a1: First record; read through ``.get(...)`` with the
                ``AddressComponents.STREET``, ``HOUSE_NUMBER`` and
                ``HOUSE_NUMBER_BASE`` keys (presumably a dict of strings --
                confirm against callers).
            a2: Second record, read the same way.
            languages: Passed unchanged to every comparison helper.
            fuzzy_street_name: If true, a street pair that is not an exact or
                likely duplicate is retried with ``is_street_duplicate_fuzzy``.

        Returns:
            A ``(status, similarity)`` tuple. ``(NON_DUPLICATE, 0.0)`` is
            returned when only one record has a street, only one has a house
            number, neither component is available on both, or a component
            that is present on both does not match.
        """
        def _stripped(value):
            # Only truthy values are stripped; None/empty pass through as-is.
            return value.strip() if value else value

        street_a = _stripped(a1.get(AddressComponents.STREET))
        street_b = _stripped(a2.get(AddressComponents.STREET))
        number_a = _stripped(a1.get(AddressComponents.HOUSE_NUMBER))
        number_b = _stripped(a2.get(AddressComponents.HOUSE_NUMBER))
        base_a = _stripped(a1.get(AddressComponents.HOUSE_NUMBER_BASE))
        base_b = _stripped(a2.get(AddressComponents.HOUSE_NUMBER_BASE))

        # A component present on exactly one side rules out a duplicate.
        if bool(street_a) != bool(street_b):
            return (duplicate_status.NON_DUPLICATE, 0.0)
        if bool(number_a) != bool(number_b):
            return (duplicate_status.NON_DUPLICATE, 0.0)

        matching_statuses = (duplicate_status.EXACT_DUPLICATE,
                             duplicate_status.LIKELY_DUPLICATE)

        # ---- street comparison -------------------------------------------
        streets_present = bool(street_a and street_b)
        street_match = False
        street_status = duplicate_status.NON_DUPLICATE
        street_sim = 0.0

        if streets_present:
            street_status = is_street_duplicate(street_a, street_b, languages=languages)
            street_match = street_status in matching_statuses

            # Fixed similarity per status; anything unlisted scores 0.0.
            street_sim = 0.0
            for known_status, known_sim in (
                (duplicate_status.EXACT_DUPLICATE, 1.0),
                (duplicate_status.NEEDS_REVIEW, 0.5),
                (duplicate_status.LIKELY_DUPLICATE, 0.9),
            ):
                if street_status == known_status:
                    street_sim = known_sim
                    break

            if fuzzy_street_name and not street_match:
                # Uniform weight (1) per content token before normalization.
                tokens_a = Name.content_tokens(street_a, languages=languages)
                weights_a = WordIndex.normalized_vector([1] * len(tokens_a))
                tokens_b = Name.content_tokens(street_b, languages=languages)
                weights_b = WordIndex.normalized_vector([1] * len(tokens_b))
                if tokens_a and tokens_b and weights_a and weights_b:
                    street_status, street_sim = is_street_duplicate_fuzzy(
                        tokens_a, weights_a, tokens_b, weights_b, languages=languages)
                    street_match = street_status in matching_statuses

            if not street_match:
                return (duplicate_status.NON_DUPLICATE, 0.0)

        # ---- house number comparison -------------------------------------
        numbers_present = bool(number_a and number_b)
        number_match = False
        house_number_status = duplicate_status.NON_DUPLICATE
        house_number_sim = 0.0

        if numbers_present:
            house_number_status = is_house_number_duplicate(
                number_a, number_b, languages=languages)
            if house_number_status == duplicate_status.EXACT_DUPLICATE:
                number_match = True
                house_number_sim = 1.0
            elif base_a or base_b:
                # Retry on base house numbers, substituting the full house
                # number on whichever side has no base value.
                base_status = is_house_number_duplicate(
                    base_a or number_a, base_b or number_b, languages=languages)
                if base_status == duplicate_status.EXACT_DUPLICATE:
                    number_match = True
                    house_number_status = duplicate_status.LIKELY_DUPLICATE
                    house_number_sim = 0.9

            if not number_match:
                return (duplicate_status.NON_DUPLICATE, house_number_sim)

        # ---- combine -----------------------------------------------------
        if not (numbers_present or streets_present):
            return (duplicate_status.NON_DUPLICATE, 0.0)

        street_result = (street_status, street_sim)
        number_result = (house_number_status, house_number_sim)

        if streets_present and street_match:
            if number_match or not numbers_present:
                # NOTE(review): with no house numbers on either side,
                # number_result is still the (NON_DUPLICATE, 0.0) default, so
                # min() chooses between that and the street result -- confirm
                # this is the intended outcome for street-only records.
                return min(street_result, number_result)
            # A house-number mismatch has already returned above, so this
            # fallback looks purely defensive.
            return street_result

        if numbers_present and number_match:
            return number_result

        return (duplicate_status.NON_DUPLICATE, 0.0)