def compare(langid1, langid2, langdists):
    """
    Get script similarity between languages.

    Each 3-letter code is translated through :func:`utils.getlangmap`, the
    translated code is looked up in `langdists`, and the `charfreqs` of the
    two entries found are passed to :func:`simdist`.

    If either language has no entry in the language map or is not present in
    `langdists`, a message is printed and the returned score is -1.

    :param langid1: 3-letter language code
    :param langid2: 3-letter language code
    :param langdists: output from :func:`wikidatastats.loaddump`
    :return: a score of script similarity, or -1 if either language is
        unavailable
    """
    # FIXME: these should be loaded somewhere else for efficiency.
    three2two = utils.getlangmap()

    entries = []
    for langid in (langid1, langid2):
        # A code without a mapping cannot be looked up in `langdists`, so it
        # is reported the same way as a missing entry instead of raising
        # KeyError.
        if langid not in three2two:
            print("%s not in language map." % langid)
            return -1

        mapped = three2two[langid]
        if mapped not in langdists:
            # Single-string form prints the same text on Python 2 and 3.
            print("%s not in langdists." % mapped)
            return -1

        entries.append(langdists[mapped])

    first, second = entries
    return simdist(first.charfreqs, second.charfreqs)
def getclosest(lang, langdists):
    """
    This calculates script similarities between `lang` and all other
    languages in `langdists`.

    `lang` is translated through :func:`utils.getlangmap` before being looked
    up in `langdists`; a `KeyError` propagates if the code has no mapping or
    the mapped code is not in `langdists`.

    :param lang: 3-letter language code
    :param langdists: output from :func:`wikidatastats.loaddump`
    :return: a map of form {langcode : float, ...}, keyed by the keys of
        `langdists`, with the entry for `lang` itself left out
    """
    three2two = utils.getlangmap()
    lang2 = three2two[lang]
    base_freqs = langdists[lang2].charfreqs

    # `langdists` is indexed by the mapped code, so the entry to skip is
    # `lang2`; comparing against the 3-letter `lang` would leave the query
    # language in its own result.
    return {
        langcode: simdist(base_freqs, langdists[langcode].charfreqs)
        for langcode in langdists
        if langcode != lang2
    }
def sim_overall_closest(l1, lambda1=1./3, lambda2=1./3, lambda3=1./3):
    """
    Given a language, this gets a list of close languages.

    The score maps returned by :func:`sim_phon_closest`,
    :func:`sim_script_closest` and :func:`sim_gen_closest` are combined into a
    weighted sum. A language is kept only if it has a score in all three maps.

    :param l1: language identifier, passed unchanged to the three
        ``sim_*_closest`` functions
    :param lambda1: weight applied to the :func:`sim_phon_closest` score
    :param lambda2: weight applied to the :func:`sim_script_closest` score
    :param lambda3: weight applied to the :func:`sim_gen_closest` score
    :return: list of tuples ``(combined, phon, script, gen, language)`` sorted
        by descending score, where `language` is a :class:`utils.Language`
        with `iso3` set
    """
    # Only the score maps are needed; the first element of each result is
    # ignored.
    _, sp = sim_phon_closest(l1)
    # the keys to this are wikipedia langids!
    _, ss = sim_script_closest(l1)
    _, sg = sim_gen_closest(l1)

    three2two = utils.getlangmap()

    ret = []
    # loop over languages in phoible set (which are 3 char)
    for p in sp:
        # skip if we can't get it into 2 char
        if p not in three2two:
            continue
        p2 = three2two[p]

        # the 2 char must be in the wiki name list, and the 3 char in the
        # wals list
        if p2 not in ss or p not in sg:
            continue

        phlang = utils.Language()
        phlang.iso3 = p

        combined = lambda1 * sp[p] + lambda2 * ss[p2] + lambda3 * sg[p]
        ret.append((combined, sp[p], ss[p2], sg[p], phlang))

    # Sort on the numeric fields only, so that ties never fall through to
    # comparing Language objects.
    ret.sort(key=lambda entry: entry[:4], reverse=True)
    return ret