def closest_by_sound(klass, search_string, similarity_threshold=0.8):
    '''
    Finds objects of klass whose name sounds like search_string.

    Every object from klass.objects.all() is compared using the metaphone
    codes returned by dm(); the result of dm() is indexed as
    (primary, secondary), and the secondary code may be None. The longest
    word of each object's name (upper-cased, with '.' treated as a space)
    is used as the surname guess.

    Returns a list of (obj, obj_sound, score) tuples:
      * if any object's primary metaphone equals the primary or secondary
        metaphone of the search term, only those exact matches are
        returned, each with a score of 1.0;
      * otherwise the objects whose jarow() score against the search
        term's primary (or, failing that, secondary) metaphone is at
        least similarity_threshold, sorted by ascending score. When more
        than 5 such objects exist, only those scoring at or above the
        mean score are kept.

    The result is always a plain list; the search metaphones are not
    included in it.
    '''
    same = []
    similar = []

    # metaphones of search term
    search_sound = dm(search_string)
    search_primary = search_sound[0]
    search_secondary = search_sound[1]

    for obj in klass.objects.all():
        name_parts = str(obj.name).upper().replace('.', ' ').split()
        if not name_parts:
            # blank (or dots-only) name: there is no word to compare, and
            # max() would raise ValueError on the empty list
            continue
        surname_guess = max(name_parts, key=len)

        # metaphones of obj name; only the primary one is compared
        obj_sound = dm(surname_guess)
        obj_primary = obj_sound[0]

        # exact match on either metaphone of the search term
        if search_primary == obj_primary or (
                search_secondary is not None
                and search_secondary == obj_primary):
            same.append((obj, obj_sound, 1.0))
            continue

        # no exact match, so see if the primary metaphones are similar
        primary_score = jarow(str(search_primary), str(obj_primary))
        if primary_score >= similarity_threshold:
            similar.append((obj, obj_sound, primary_score))
            continue

        # still no good match: fall back to the secondary metaphone of
        # the search term, when there is one
        if search_secondary is not None:
            secondary_score = jarow(str(search_secondary), str(obj_primary))
            if secondary_score >= similarity_threshold:
                similar.append((obj, obj_sound, secondary_score))

    if same:
        return same

    # keyword form of sort() behaves the same as the old positional
    # (cmp, key) call on Python 2 and is also accepted by Python 3
    similar.sort(key=operator.itemgetter(2))

    # with many similar matches, keep only those at or above the mean score
    if len(similar) > 5:
        mean_score = sum((match[2] for match in similar), 0.0) / len(similar)
        return [match for match in similar if match[2] >= mean_score]
    return similar
def calc_dists(mine, theirs):
    '''
    Compares two values as unicode strings using three string metrics.

    Both arguments are converted with unicode() before comparison.
    Returns a 3-item tuple holding, in order, the Levenshtein distance,
    the Damerau-Levenshtein distance, and the Jaro-Winkler distance.
    '''
    left = unicode(mine)
    right = unicode(theirs)
    return (
        distance(left, right),            # levenshtein
        dameraulevenshtein(left, right),  # damerau-levenshtein
        jarow(left, right),               # jaro-winkler
    )
# Upper-case country name -> three-letter country code used by
# reconcile_country(). None marks names that have no code in this table.
# Built once at import time instead of on every call.
# NOTE(review): 'ROM' for ROMANIA looks like the pre-2002 ISO code (the
# current one is 'ROU'); left unchanged -- confirm against the data that
# consumes these codes.
_COUNTRY_CODES_BY_NAME = {
    u'AFGHANISTAN': u'AFG',
    u'ALBANIA': u'ALB',
    u'ALGERIA': u'DZA',
    u'AMERICAN SAMOA': u'ASM',
    u'ANDORRA': u'AND',
    u'ANGOLA': u'AGO',
    u'ANGUILLA': u'AIA',
    u'ANTARCTICA': None,
    u'ANTIGUA AND BARBUDA': u'ATG',
    u'ARGENTINA': u'ARG',
    u'ARMENIA': u'ARM',
    u'ARUBA': u'ABW',
    u'AUSTRALIA': u'AUS',
    u'AUSTRIA': u'AUT',
    u'AZERBAIJAN': u'AZE',
    u'BAHAMAS': u'BHS',
    u'BAHRAIN': u'BHR',
    u'BANGLADESH': u'BGD',
    u'BARBADOS': u'BRB',
    u'BELARUS': u'BLR',
    u'BELGIUM': u'BEL',
    u'BELIZE': u'BLZ',
    u'BENIN': u'BEN',
    u'BERMUDA': u'BMU',
    u'BHUTAN': u'BTN',
    u'BOLIVIA': u'BOL',
    u'BOSNIA AND HERZEGOVINA': u'BIH',
    u'BOTSWANA': u'BWA',
    u'BOUVET ISLAND': None,
    u'BRAZIL': u'BRA',
    u'BRITISH INDIAN OCEAN TERRITORY': None,
    u'BRUNEI DARUSSALAM': u'BRN',
    u'BULGARIA': u'BGR',
    u'BURKINA FASO': u'BFA',
    u'BURUNDI': u'BDI',
    u'CAMBODIA': u'KHM',
    u'CAMEROON': u'CMR',
    u'CANADA': u'CAN',
    u'CAPE VERDE': u'CPV',
    u'CAYMAN ISLANDS': u'CYM',
    u'CENTRAL AFRICAN REPUBLIC': u'CAF',
    u'CHAD': u'TCD',
    u'CHILE': u'CHL',
    u'CHINA': u'CHN',
    u'CHRISTMAS ISLAND': None,
    u'COCOS (KEELING) ISLANDS': None,
    u'COLOMBIA': u'COL',
    u'COMOROS': u'COM',
    u'CONGO': u'COG',
    u'CONGO, THE DEMOCRATIC REPUBLIC OF THE': u'COD',
    # was u'C*K', which is not a valid three-letter code
    u'COOK ISLANDS': u'COK',
    u'COSTA RICA': u'CRI',
    u"COTE D'IVOIRE": u'CIV',
    u'CROATIA': u'HRV',
    u'CUBA': u'CUB',
    u'CYPRUS': u'CYP',
    u'CZECH REPUBLIC': u'CZE',
    u'DENMARK': u'DNK',
    u'DJIBOUTI': u'DJI',
    u'DOMINICA': u'DMA',
    u'DOMINICAN REPUBLIC': u'DOM',
    u'ECUADOR': u'ECU',
    u'EGYPT': u'EGY',
    u'EL SALVADOR': u'SLV',
    u'EQUATORIAL GUINEA': u'GNQ',
    u'ERITREA': u'ERI',
    u'ESTONIA': u'EST',
    u'ETHIOPIA': u'ETH',
    u'FALKLAND ISLANDS (MALVINAS)': u'FLK',
    u'FAROE ISLANDS': u'FRO',
    u'FIJI': u'FJI',
    u'FINLAND': u'FIN',
    u'FRANCE': u'FRA',
    u'FRENCH GUIANA': u'GUF',
    u'FRENCH POLYNESIA': u'PYF',
    u'FRENCH SOUTHERN TERRITORIES': None,
    u'GABON': u'GAB',
    u'GAMBIA': u'GMB',
    u'GEORGIA': u'GEO',
    u'GERMANY': u'DEU',
    u'GHANA': u'GHA',
    u'GIBRALTAR': u'GIB',
    u'GREECE': u'GRC',
    u'GREENLAND': u'GRL',
    u'GRENADA': u'GRD',
    u'GUADELOUPE': u'GLP',
    u'GUAM': u'GUM',
    u'GUATEMALA': u'GTM',
    u'GUINEA': u'GIN',
    u'GUINEA-BISSAU': u'GNB',
    u'GUYANA': u'GUY',
    u'HAITI': u'HTI',
    u'HEARD ISLAND AND MCDONALD ISLANDS': None,
    u'HOLY SEE (VATICAN CITY STATE)': u'VAT',
    u'HONDURAS': u'HND',
    u'HONG KONG': u'HKG',
    u'HUNGARY': u'HUN',
    u'ICELAND': u'ISL',
    u'INDIA': u'IND',
    u'INDONESIA': u'IDN',
    u'IRAN, ISLAMIC REPUBLIC OF': u'IRN',
    u'IRAQ': u'IRQ',
    u'IRELAND': u'IRL',
    u'ISRAEL': u'ISR',
    u'ITALY': u'ITA',
    u'JAMAICA': u'JAM',
    u'JAPAN': u'JPN',
    u'JORDAN': u'JOR',
    u'KAZAKHSTAN': u'KAZ',
    u'KENYA': u'KEN',
    u'KIRIBATI': u'KIR',
    u"KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF": u'PRK',
    u'KOREA, REPUBLIC OF': u'KOR',
    u'KUWAIT': u'KWT',
    u'KYRGYZSTAN': u'KGZ',
    u"LAO PEOPLE'S DEMOCRATIC REPUBLIC": u'LAO',
    u'LATVIA': u'LVA',
    u'LEBANON': u'LBN',
    u'LESOTHO': u'LSO',
    u'LIBERIA': u'LBR',
    u'LIBYAN ARAB JAMAHIRIYA': u'LBY',
    u'LIECHTENSTEIN': u'LIE',
    u'LITHUANIA': u'LTU',
    u'LUXEMBOURG': u'LUX',
    u'MACAO': u'MAC',
    u'MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF': u'MKD',
    u'MADAGASCAR': u'MDG',
    u'MALAWI': u'MWI',
    u'MALAYSIA': u'MYS',
    u'MALDIVES': u'MDV',
    u'MALI': u'MLI',
    u'MALTA': u'MLT',
    u'MARSHALL ISLANDS': u'MHL',
    u'MARTINIQUE': u'MTQ',
    u'MAURITANIA': u'MRT',
    u'MAURITIUS': u'MUS',
    u'MAYOTTE': None,
    u'MEXICO': u'MEX',
    u'MICRONESIA, FEDERATED STATES OF': u'FSM',
    u'MOLDOVA, REPUBLIC OF': u'MDA',
    u'MONACO': u'MCO',
    u'MONGOLIA': u'MNG',
    u'MONTSERRAT': u'MSR',
    u'MOROCCO': u'MAR',
    u'MOZAMBIQUE': u'MOZ',
    u'MYANMAR': u'MMR',
    u'NAMIBIA': u'NAM',
    u'NAURU': u'NRU',
    u'NEPAL': u'NPL',
    u'NETHERLANDS': u'NLD',
    u'NETHERLANDS ANTILLES': u'ANT',
    u'NEW CALEDONIA': u'NCL',
    u'NEW ZEALAND': u'NZL',
    u'NICARAGUA': u'NIC',
    u'NIGER': u'NER',
    u'NIGERIA': u'NGA',
    u'NIUE': u'NIU',
    u'NORFOLK ISLAND': u'NFK',
    u'NORTHERN MARIANA ISLANDS': u'MNP',
    u'NORWAY': u'NOR',
    u'OMAN': u'OMN',
    u'PAKISTAN': u'PAK',
    u'PALAU': u'PLW',
    u'PALESTINIAN TERRITORY, OCCUPIED': None,
    u'PANAMA': u'PAN',
    u'PAPUA NEW GUINEA': u'PNG',
    u'PARAGUAY': u'PRY',
    u'PERU': u'PER',
    u'PHILIPPINES': u'PHL',
    u'PITCAIRN': u'PCN',
    u'POLAND': u'POL',
    u'PORTUGAL': u'PRT',
    u'PUERTO RICO': u'PRI',
    u'QATAR': u'QAT',
    u'REUNION': u'REU',
    u'ROMANIA': u'ROM',
    u'RUSSIAN FEDERATION': u'RUS',
    u'RWANDA': u'RWA',
    u'SAINT HELENA': u'SHN',
    u'SAINT KITTS AND NEVIS': u'KNA',
    u'SAINT LUCIA': u'LCA',
    u'SAINT PIERRE AND MIQUELON': u'SPM',
    u'SAINT VINCENT AND THE GRENADINES': u'VCT',
    u'SAMOA': u'WSM',
    u'SAN MARINO': u'SMR',
    u'SAO TOME AND PRINCIPE': u'STP',
    u'SAUDI ARABIA': u'SAU',
    u'SENEGAL': u'SEN',
    u'SERBIA AND MONTENEGRO': None,
    u'SEYCHELLES': u'SYC',
    u'SIERRA LEONE': u'SLE',
    u'SINGAPORE': u'SGP',
    u'SLOVAKIA': u'SVK',
    u'SLOVENIA': u'SVN',
    u'SOLOMON ISLANDS': u'SLB',
    u'SOMALIA': u'SOM',
    u'SOUTH AFRICA': u'ZAF',
    u'SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS': None,
    u'SPAIN': u'ESP',
    u'SRI LANKA': u'LKA',
    u'SUDAN': u'SDN',
    u'SURINAME': u'SUR',
    u'SVALBARD AND JAN MAYEN': u'SJM',
    u'SWAZILAND': u'SWZ',
    u'SWEDEN': u'SWE',
    u'SWITZERLAND': u'CHE',
    u'SYRIAN ARAB REPUBLIC': u'SYR',
    u'TAIWAN, PROVINCE OF CHINA': u'TWN',
    u'TAJIKISTAN': u'TJK',
    u'TANZANIA, UNITED REPUBLIC OF': u'TZA',
    u'THAILAND': u'THA',
    u'TIMOR-LESTE': None,
    u'TOGO': u'TGO',
    u'TOKELAU': u'TKL',
    u'TONGA': u'TON',
    u'TRINIDAD AND TOBAGO': u'TTO',
    u'TUNISIA': u'TUN',
    u'TURKEY': u'TUR',
    u'TURKMENISTAN': u'TKM',
    u'TURKS AND CAICOS ISLANDS': u'TCA',
    u'TUVALU': u'TUV',
    u'UGANDA': u'UGA',
    u'UKRAINE': u'UKR',
    u'UNITED ARAB EMIRATES': u'ARE',
    u'UNITED KINGDOM': u'GBR',
    u'UNITED STATES': u'USA',
    u'UNITED STATES MINOR OUTLYING ISLANDS': None,
    u'URUGUAY': u'URY',
    u'UZBEKISTAN': u'UZB',
    u'VANUATU': u'VUT',
    u'VENEZUELA': u'VEN',
    u'VIET NAM': u'VNM',
    u'VIRGIN ISLANDS, BRITISH': u'VGB',
    u'VIRGIN ISLANDS, U.S.': u'VIR',
    u'WALLIS AND FUTUNA': u'WLF',
    u'WESTERN SAHARA': u'ESH',
    u'YEMEN': u'YEM',
    u'ZAMBIA': u'ZMB',
    u'ZIMBABWE': u'ZWE',
}


def reconcile_country(raw_country, similarity_threshold=2):
    '''
    Resolves a country name to its three-letter code, or suggests
    similar-sounding country names when there is no exact match.

    raw_country is upper-cased and looked up in the country table. On a
    hit, returns (True, code); code is None for names the table lists
    without a code.

    Otherwise returns (False, {'double-metaphone': suggestions}), where
    suggestions is a list of (score, raw_country, country_name, code)
    tuples. A country is suggested with score 1.0 when its primary
    metaphone equals the primary or secondary metaphone of raw_country,
    or with its jarow() score when that score between the two primary
    metaphones is at least similarity_threshold.

    similarity_threshold defaults to 2, the value that was previously
    hard-coded. NOTE(review): jarow() appears to return a similarity in
    the 0-1 range (elsewhere in this file it is compared against 0.8),
    in which case the default never matches and only exact metaphone
    matches are suggested; pass a value such as 0.8 to also receive
    fuzzy suggestions -- confirm the range of jarow() before relying on
    this.
    '''
    # check if term is a key in the country table
    country_key = raw_country.upper()
    if country_key in _COUNTRY_CODES_BY_NAME:
        return True, _COUNTRY_CODES_BY_NAME[country_key]

    # metaphones of the search term, indexed as (primary, secondary);
    # the secondary code may be None
    search_sound = dm(unicode(raw_country))
    search_primary = search_sound[0]
    search_secondary = search_sound[1]

    suggestions = []
    for country_name, code in _COUNTRY_CODES_BY_NAME.items():
        country_primary = dm(unicode(country_name))[0]

        # exact match on either metaphone of the search term
        if search_primary == country_primary or (
                search_secondary is not None
                and search_secondary == country_primary):
            suggestions.append((1.0, raw_country, country_name, code))
            continue

        # no exact match, so see if the primary metaphones are similar.
        # (This branch used to append to an undefined name, `similar`,
        # and would have raised NameError had it ever been reached.)
        primary_sound_dist = jarow(str(search_primary), str(country_primary))
        if primary_sound_dist >= similarity_threshold:
            suggestions.append(
                (primary_sound_dist, raw_country, country_name, code))

    return False, {'double-metaphone': suggestions}