def closest_to(klass, term):
    """Return the objects of ``klass`` whose abbreviations best match ``term``.

    Every object returned by ``klass.objects.all()`` is scored once for its
    ``abbr_en`` and once for its ``abbr_fr`` (when set and different from the
    English one) by calling ``dm(term, abbreviation)`` on the lower-cased
    strings.  ``dm`` is provided elsewhere in the module; the original
    comments describe it as an edit distance.

    Args:
        klass: class exposing ``objects.all()``; each object must have
            ``abbr_en`` and ``abbr_fr`` attributes (string or ``None``).
        term: the search string.

    Returns:
        A set with the objects behind the 10 lowest scores (fewer than 10
        objects when one object contributes both abbreviations), or ``None``
        when any exception was raised -- the error is printed, not re-raised.
    """
    try:
        big_term = term.lower()

        vax_tuples = []
        for obj in klass.objects.all():
            big_en = ""
            if obj.abbr_en is not None:
                big_en = obj.abbr_en.lower()
                # score the term against the english abbreviation
                vax_tuples.append((dm(big_term, big_en), obj, obj.abbr_en))

            if obj.abbr_fr is not None:
                big_fr = obj.abbr_fr.lower()
                # skip the french abbreviation when it would only repeat
                # the english score
                if big_fr != big_en:
                    vax_tuples.append((dm(big_term, big_fr), obj, obj.abbr_fr))

        # sort by ascending score only (objects themselves are never
        # compared) and keep the 10 best entries
        closest = sorted(vax_tuples, key=itemgetter(0))[:10]

        # return only the objects
        return set(map(itemgetter(1), closest))

    except Exception as e:
        # deliberate best-effort: report the problem and fall through,
        # which makes the function return None
        print('BANG')
        print(e)
def country_aware_closest_to(klass, term, country_pk):
    """Find objects of ``klass`` matching ``term``, split by country stock.

    Candidates are found in two stages:

    1. Objects whose ``_field_list()`` shares at least one word with
       ``_split_term(term)``, plus every object reachable through
       ``obj.group.vaccine_set.all()`` for those matches.
    2. Only if stage 1 finds nothing: the objects behind the 10 lowest
       ``dm(term, abbreviation)`` scores over ``abbr_en`` / ``abbr_fr``
       (``dm`` is defined elsewhere; the original comments describe it as
       an edit distance).

    Args:
        klass: class exposing ``objects.all()``.
        term: the search string.
        country_pk: value passed as ``country=`` to
            ``CountryStock.objects.filter``.

    Returns:
        * ``(list_of_up_to_20_candidates, None)`` when none of the candidates
          is a ``vaccine`` of the country's ``CountryStock`` rows;
        * ``(not_stocked, stocked)`` -- two sets -- otherwise;
        * a bare ``None`` when any exception was raised (the error is
          printed, not re-raised).
    """
    try:
        big_term = term.lower()

        # grrr edge cases
        big_term = big_term.replace('meningitis', 'mening')
        big_term = big_term.replace('measles', 'mea')

        countrystocks = CountryStock.objects.filter(country=country_pk)
        # Vaccines this country has stocks of.  Built as a set once: it is
        # consulted several times below, and a lazy ``map`` object would be
        # empty after its first use on Python 3.
        cs_vaccines = set(stock.vaccine for stock in countrystocks)

        # words of the term, computed once instead of once per object
        term_words = set(_split_term(big_term))

        # objects whose name and/or abbreviations contain any word of the term
        vax_partials = [obj for obj in klass.objects.all()
                        if not term_words.isdisjoint(obj._field_list())]

        if vax_partials:
            # widen the match to every member of the matched objects' groups
            groups = set(obj.group for obj in vax_partials)
            relatives = itertools.chain.from_iterable(
                group.vaccine_set.all() for group in groups)
            only_objs = set(vax_partials).union(relatives)
        else:
            # no overlapping words: fall back to the dm score of the
            # abbreviations
            vax_tuples = []
            for obj in klass.objects.all():
                big_en = ""
                if obj.abbr_en is not None:
                    big_en = obj.abbr_en.lower()
                    vax_tuples.append((dm(big_term, big_en), obj, obj.abbr_en))

                if obj.abbr_fr is not None:
                    big_fr = obj.abbr_fr.lower()
                    # skip the french abbreviation when it would only
                    # repeat the english score
                    if big_fr != big_en:
                        vax_tuples.append(
                            (dm(big_term, big_fr), obj, obj.abbr_fr))

            # sort by ascending score and keep the 10 best entries
            closest = sorted(vax_tuples, key=itemgetter(0))[:10]
            only_objs = set(map(itemgetter(1), closest))

        stocked = only_objs.intersection(cs_vaccines)
        if not stocked:
            # nothing the country stocks: return top matches and None
            return list(only_objs)[:20], None

        # otherwise: the matches the country does not stock, then the ones
        # it does
        return only_objs.difference(cs_vaccines), stocked

    except Exception as e:
        # deliberate best-effort: report the problem and fall through,
        # which makes the function return None
        print('BANG country aware closest to')
        print(e)
# (substring looked for in the upper-cased term, replacement search term).
# Applied in order, each check seeing the result of the previous one.
_COUNTRY_ALIASES = (
    ('OPT', 'palestinian'),
    ('PALESTINE', 'palestinian'),
    ('GAZA', 'palestinian'),
    ("SERBIA", 'serbia and montenegro'),
    ("LIBYA", 'libyan arab JAMAHIRIYA'),
    ("DRC", 'congo'),
    ("IVORY", "COTE D'IVOIRE"),
    ("VERT", "cape verde"),
    ("PACIFIC", "fiji"),
)

# words to exclude when attempting to match country names
_COUNTRY_STOPWORDS = frozenset([
    "democratic", "peoples", "republic", "the", "of", "american", "french",
    "brazzaville", "islamic", "people's", "territory", "kingdom",
    "démocratique", "république", "territories", "françaises", "française",
    "islands", "british", "britannique", "américaines", "britanniques",
    "western", "occidental", "république-unie", "l'ex-république",
    "equatorial", "équatoriale", "territoire", "plurinational", "conakry",
    "states", "états", "outlying", "éloignées", "federation", "fédération",
    "pays", "sultanate",
])


def _longest_significant_word(text, stopwords=_COUNTRY_STOPWORDS):
    """Return the longest word of ``text`` not in ``stopwords``, or None.

    The text is lower-cased, hyphens count as word separators and commas
    around a word are stripped.  Ties between equally long words are broken
    by string order so the result does not depend on set iteration order.
    """
    words = set(word.strip(",")
                for word in text.lower().replace('-', ' ').split())
    candidates = words.difference(stopwords)
    candidates.discard("")
    if not candidates:
        return None
    return max(candidates, key=lambda word: (len(word), word))


def closest_to(klass, term):
    """Return the objects of ``klass`` whose names best match ``term``.

    The term is first run through a few hard-coded aliases, then reduced to
    its longest word that is not a stop word.  Each object returned by
    ``klass.objects.all()`` is reduced the same way for ``name`` and
    ``name_fr`` and scored with ``dm(term_word, name_word)`` (``dm`` is
    defined elsewhere; the original comments describe it as an edit
    distance).

    Names made up only of stop words are skipped instead of aborting the
    whole lookup.

    Args:
        klass: class exposing ``objects.all()``; each object must have
            ``name`` and ``name_fr`` attributes (string or ``None``).
        term: the search string.

    Returns:
        A set with the objects behind the 5 lowest scores; an empty set when
        the term contains no significant word or nothing could be scored;
        ``None`` when any exception was raised (the error is printed, not
        re-raised).
    """
    try:
        # grr edge cases...
        for needle, replacement in _COUNTRY_ALIASES:
            if needle in term.upper():
                term = replacement

        big_term = _longest_significant_word(term)
        if big_term is None:
            # nothing left to compare against, so there are no matches
            return set()

        country_tuples = []
        for obj in klass.objects.all():
            big_en = None
            if obj.name is not None:
                big_en = _longest_significant_word(obj.name)
                if big_en is not None:
                    # score the term against the english name
                    country_tuples.append(
                        (dm(big_term, big_en), obj, obj.name))

            if obj.name_fr is not None:
                big_fr = _longest_significant_word(obj.name_fr)
                # skip the french name when it would only repeat the
                # english score
                if big_fr is not None and big_fr != big_en:
                    country_tuples.append(
                        (dm(big_term, big_fr), obj, obj.name_fr))

        # sort by ascending score only (objects themselves are never
        # compared) and keep the 5 best entries
        closest = sorted(country_tuples, key=itemgetter(0))[:5]

        # return only the objects
        return set(map(itemgetter(1), closest))

    except Exception as e:
        # deliberate best-effort: report the problem and fall through,
        # which makes the function return None
        print('BANG')
        print(e)