def langsim(fname, lang, threshold, phon, only_hr=False, topk=20):
    """
    Get the ``topk`` languages most similar to ``lang``.

    Similarity is computed over the vectors returned by each language's
    ``phon_feats()``. The distance between the target and a candidate is
    ``1 - matches / len(target_features)``, where ``matches`` counts the
    positions at which both vectors hold the same value, excluding the
    positions at which both are zero.

    :param fname: passed to ``wals.loadLangs`` to load the language records.
    :param lang: code of the target language; it is compared against the
        UTF-8 decoded ``"iso_code"`` field of every record.
    :param threshold: currently unused; kept for interface compatibility.
    :param phon: currently unused; kept for interface compatibility.
    :param only_hr: if true, only candidates whose ``hr`` attribute is truthy
        are ranked. The target itself is looked up before this filter is
        applied, so it does not have to be high resource.
    :param topk: maximum number of results to return.
    :return: a list of ``(distance, full name)`` tuples sorted in ascending
        order, at most ``topk`` long. If no record matches ``lang`` the list
        ``[(-1, "Language '...' not found...")]`` is returned instead.
    """
    langs = wals.loadLangs(fname)

    # NOTE: ``threshold`` was once meant to filter records by their
    # ``nonzerofrac``; that filter is disabled, so every record is considered.

    tgtlang = None
    for cand in langs:
        if cand["iso_code"].decode("utf8") == lang:
            tgtlang = cand
            break

    if tgtlang is None:
        return [(-1, "Language '{0}' not found...".format(lang.encode("utf8")))]

    # Filter by high resource only now: the target was located first because
    # it may not be high resource itself.
    if only_hr:
        langs = [cand for cand in langs if cand.hr]

    tgtf = tgtlang.phon_feats()
    # Loop invariants: zero positions of the target and the normaliser.
    # NOTE(review): ``tgtf == 0`` presumably relies on phon_feats() returning
    # a numpy array -- confirm against the language class.
    tgt_zeros = set(np.where(tgtf == 0)[0])
    num_feats = float(len(tgtf))

    dists = []
    for cand in langs:
        # Never rank the target against itself.
        if cand["iso_code"].decode("utf8") == lang:
            continue

        feats = cand.phon_feats()
        numequal = sum(np.equal(tgtf, feats))

        # Positions where both vectors are zero compare equal but should not
        # count as agreement, so remove them from the tally.
        cand_zeros = set(np.where(feats == 0)[0])
        numequal -= len(tgt_zeros.intersection(cand_zeros))

        dists.append((1 - numequal / num_feats, cand.fullname()))

    return sorted(dists)[:topk]
def compareFeats(fname, lang1, lang2):
    """
    Look up two language records by name.

    Every record returned by ``wals.loadLangs(fname)`` is scanned; its
    ``"Name"`` field is UTF-8 decoded and compared with ``lang1`` first and,
    only if that does not match, with ``lang2``. The scan does not stop at the
    first hit, so when several records share a name the last one wins.

    :param fname: passed to ``wals.loadLangs`` to load the language records.
    :param lang1: name of the first language to find.
    :param lang2: name of the second language to find.
    :return: a ``(record1, record2)`` tuple; an entry is ``None`` when no
        record matched the corresponding name.
    """
    first_match, second_match = None, None

    for entry in wals.loadLangs(fname):
        name = entry["Name"].decode("utf8")
        if name == lang1:
            first_match = entry
            continue
        if name == lang2:
            second_match = entry

    return first_match, second_match