def street_dupe_status(cls, street1, street2, languages=None, fuzzy=False):
    """Compare two street names and return a ``Dupe`` with status and similarity.

    Args:
        street1: First street name.
        street2: Second street name.
        languages: Languages passed to the comparison helpers. When ``None``
            they are looked up per street via ``place_languages`` and merged
            with ``combined_languages``; if either lookup returns ``None``
            the first truthy result is used, falling back to
            ``DEFAULT_LANGUAGES``.
        fuzzy: When true and the direct comparison is not an exact/likely
            duplicate, retry using ``is_street_duplicate_fuzzy`` over the
            content tokens of both names.

    Returns:
        ``Dupe(status, sim)``. The direct comparison maps to a similarity of
        1.0 (exact), 0.9 (likely), 0.5 (needs review) or 0.0 (anything else).
        If the direct comparison is not exact/likely and no fuzzy result is
        produced, the status is reported as ``NON_DUPLICATE`` while ``sim``
        keeps the value from the direct comparison.
    """
    if languages is None:
        langs_a = place_languages(['road'], [street1])
        langs_b = place_languages(['road'], [street2])
        if langs_a is None or langs_b is None:
            languages = langs_a or langs_b or DEFAULT_LANGUAGES
        else:
            languages = combined_languages(langs_a, langs_b)

    direct_status = is_street_duplicate(street1, street2, languages=languages)

    # Fixed similarity assigned to each status of the direct comparison.
    status_scores = (
        (duplicate_status.EXACT_DUPLICATE, 1.0),
        (duplicate_status.NEEDS_REVIEW, 0.5),
        (duplicate_status.LIKELY_DUPLICATE, 0.9),
    )
    for known_status, score in status_scores:
        if direct_status == known_status:
            direct_sim = score
            break
    else:
        direct_sim = 0.0

    if direct_status in (duplicate_status.EXACT_DUPLICATE,
                         duplicate_status.LIKELY_DUPLICATE):
        return Dupe(status=direct_status, sim=direct_sim)

    if not fuzzy:
        return Dupe(status=duplicate_status.NON_DUPLICATE, sim=direct_sim)

    # Fuzzy path: every content token gets the same weight before normalizing.
    tokens_a = Name.content_tokens(street1, languages=languages)
    weights_a = WordIndex.normalized_vector([1] * len(tokens_a))
    tokens_b = Name.content_tokens(street2, languages=languages)
    weights_b = WordIndex.normalized_vector([1] * len(tokens_b))

    if not (tokens_a and tokens_b and weights_a and weights_b):
        # Nothing to compare fuzzily; keep the direct comparison's score.
        return Dupe(status=duplicate_status.NON_DUPLICATE, sim=direct_sim)

    fuzzy_status, fuzzy_sim = is_street_duplicate_fuzzy(
        tokens_a, weights_a, tokens_b, weights_b, languages=languages)
    return Dupe(status=fuzzy_status, sim=fuzzy_sim)
def dupe_class_and_sim(cls, a1, a2, word_index=None,
                       likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
                       needs_review_threshold=DedupeResponse.default_name_review_threshold,
                       with_address=True, with_unit=False,
                       with_phone_number=True, fuzzy_street_name=False):
    """Classify two records as name duplicates and score the name similarity.

    Args:
        a1: First record; read with ``.get(AddressComponents.NAME)`` and
            passed on to the address/unit/phone helpers.
        a2: Second record, used the same way as ``a1``.
        word_index: Optional index. When truthy and the names are not an
            exact duplicate, ``cls.name_dupe_similarity`` is consulted and
            its result replaces the direct one if its class compares ``>=``.
        likely_dupe_threshold: Similarity reported for a likely duplicate.
        needs_review_threshold: Similarity reported for a needs-review match.
        with_address: Require ``cls.is_address_dupe`` to succeed first.
        with_unit: Require ``cls.is_sub_building_dupe`` to succeed first.
        with_phone_number: Let ``PhoneNumberDeduper.revised_dupe_class``
            revise the final duplicate class.
        fuzzy_street_name: Forwarded to ``cls.is_address_dupe``.

    Returns:
        ``NULL_DUPE`` when either name is missing, a required address/unit
        check fails, or the direct name comparison is not exact, likely or
        needs-review; otherwise ``Dupe(dupe_class, sim)``.
    """
    name_a = a1.get(AddressComponents.NAME)
    name_b = a2.get(AddressComponents.NAME)
    if not (name_a and name_b):
        return NULL_DUPE

    langs_a = cls.address_languages(a1)
    langs_b = cls.address_languages(a2)
    languages = cls.combined_languages(langs_a, langs_b)

    if with_address and not cls.is_address_dupe(
            a1, a2, languages=languages, fuzzy_street_name=fuzzy_street_name):
        return NULL_DUPE

    if with_unit and not cls.is_sub_building_dupe(a1, a2, languages=languages):
        return NULL_DUPE

    dupe_class = cls.name_dupe_status(name_a, name_b, languages=languages)

    # Similarity assigned to each class of the direct name comparison.
    class_scores = (
        (duplicate_status.EXACT_DUPLICATE, 1.0),
        (duplicate_status.LIKELY_DUPLICATE, likely_dupe_threshold),
        (duplicate_status.NEEDS_REVIEW, needs_review_threshold),
    )
    for known_class, score in class_scores:
        if dupe_class == known_class:
            sim = score
            break
    else:
        return NULL_DUPE

    if word_index and dupe_class != duplicate_status.EXACT_DUPLICATE:
        fuzzy_class, fuzzy_sim = cls.name_dupe_similarity(
            name_a, name_b, word_index=word_index, languages=languages)
        if fuzzy_class >= dupe_class:
            dupe_class, sim = fuzzy_class, fuzzy_sim

    if with_phone_number:
        # The second element of the result is not used by this method.
        dupe_class, _phone_dupe = PhoneNumberDeduper.revised_dupe_class(
            dupe_class, a1, a2)

    return Dupe(dupe_class, sim)
def address_dupe_status(cls, a1, a2, languages=None, fuzzy_street_name=False):
    """Compare the street and house number of two records.

    Args:
        a1: First record; read with ``.get(AddressComponents.<FIELD>)`` for
            ``STREET``, ``HOUSE_NUMBER`` and ``HOUSE_NUMBER_BASE``.
        a2: Second record, read the same way as ``a1``.
        languages: Forwarded to the street and house-number comparisons.
        fuzzy_street_name: Forwarded as ``fuzzy`` to
            ``StreetDeduper.street_dupe_status``.

    Returns:
        * ``NULL_DUPE`` when only one record has a street, only one has a
          house number, or neither component is present on both.
        * ``Dupe(NON_DUPLICATE, ...)`` when a component present on both
          records does not match.
        * Otherwise a ``Dupe`` for the matching components: the street
          result, the house-number result, or, when both are present, the
          smaller of the two ``(status, sim)`` tuples as ordered by ``min``.
    """
    a1_street = a1.get(AddressComponents.STREET)
    a2_street = a2.get(AddressComponents.STREET)
    if a1_street:
        a1_street = a1_street.strip()
    if a2_street:
        a2_street = a2_street.strip()

    a1_house_number = a1.get(AddressComponents.HOUSE_NUMBER)
    a2_house_number = a2.get(AddressComponents.HOUSE_NUMBER)
    if a1_house_number:
        a1_house_number = safe_decode(a1_house_number).strip()
    if a2_house_number:
        a2_house_number = safe_decode(a2_house_number).strip()

    a1_base_house_number = a1.get(AddressComponents.HOUSE_NUMBER_BASE)
    a2_base_house_number = a2.get(AddressComponents.HOUSE_NUMBER_BASE)
    if a1_base_house_number:
        a1_base_house_number = safe_decode(a1_base_house_number).strip()
    if a2_base_house_number:
        a2_base_house_number = safe_decode(a2_base_house_number).strip()

    # A component present on only one side cannot be compared.
    if (a1_street and not a2_street) or (a2_street and not a1_street):
        return NULL_DUPE
    if (a1_house_number and not a2_house_number) or (a2_house_number and not a1_house_number):
        return NULL_DUPE

    have_street = a1_street and a2_street
    street_dupe = None
    if have_street:
        street_dupe = StreetDeduper.street_dupe_status(
            a1_street, a2_street, languages=languages, fuzzy=fuzzy_street_name)
        if street_dupe.status not in (duplicate_status.EXACT_DUPLICATE,
                                      duplicate_status.LIKELY_DUPLICATE):
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=0.0)

    have_house_number = a1_house_number and a2_house_number
    have_base_house_number = a1_base_house_number or a2_base_house_number
    house_number_status = duplicate_status.NON_DUPLICATE
    house_number_sim = 0.0
    if have_house_number:
        house_number_status = is_house_number_duplicate(
            a1_house_number, a2_house_number, languages=languages)
        same_house_number = house_number_status == duplicate_status.EXACT_DUPLICATE
        if same_house_number:
            house_number_sim = 1.0

        if have_base_house_number and not same_house_number:
            # Retry with the base house number where one is available; a
            # match found this way is only rated a likely duplicate.
            a1h = a1_base_house_number or a1_house_number
            a2h = a2_base_house_number or a2_house_number
            base_house_number_status = is_house_number_duplicate(
                a1h, a2h, languages=languages)
            same_house_number = base_house_number_status == duplicate_status.EXACT_DUPLICATE
            if same_house_number:
                house_number_status = duplicate_status.LIKELY_DUPLICATE
                house_number_sim = 0.9

        if not same_house_number:
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=house_number_sim)

    # From here on, every component present on both records has matched.
    if have_street and have_house_number:
        min_status, min_sim = min((street_dupe.status, street_dupe.sim),
                                  (house_number_status, house_number_sim))
        return Dupe(status=min_status, sim=min_sim)
    if have_house_number:
        return Dupe(status=house_number_status, sim=house_number_sim)
    if have_street:
        # Report the street comparison's own result rather than a
        # placeholder NON_DUPLICATE / 0.0 pair.
        return Dupe(status=street_dupe.status, sim=street_dupe.sim)

    return NULL_DUPE