Ejemplo n.º 1
0
def compare(langid1, langid2, langdists):
    """
    Get script similarity between two languages.

    Each 3-letter code is first translated through the map returned by
    :func:`utils.getlangmap`; the translated code is then used to look up the
    language object in `langdists`, and the two objects' ``charfreqs`` are
    passed to :func:`simdist`.

    If either language cannot be resolved -- because its code has no entry in
    the language map, or because the mapped code is not present in
    `langdists` -- a message is printed and the returned score is -1.

    :param langid1: 3-letter language code
    :param langid2: 3-letter language code
    :param langdists: output from :func:`wikidatastats.loaddump`
    :return: a score of script similarity, or -1 if a language is unknown
    """

    # FIXME: these should be loaded somewhere else for efficiency.
    three2two = utils.getlangmap()

    langobjs = []
    for langid in (langid1, langid2):
        # A code without a mapping can never be found in langdists, so treat it
        # like any other unknown language instead of raising KeyError.
        if langid not in three2two:
            print("%s not in langdists." % (langid,))
            return -1

        mapped = three2two[langid]
        if mapped not in langdists:
            print("%s not in langdists." % (mapped,))
            return -1

        langobjs.append(langdists[mapped])

    l1, l2 = langobjs
    return simdist(l1.charfreqs, l2.charfreqs)
Ejemplo n.º 2
0
def compare(langid1, langid2, langdists):
    """
    Get script similarity between two languages.

    Each 3-letter code is first translated through the map returned by
    :func:`utils.getlangmap`; the translated code is then used to look up the
    language object in `langdists`, and the two objects' ``charfreqs`` are
    passed to :func:`simdist`.

    If either language cannot be resolved -- because its code has no entry in
    the language map, or because the mapped code is not present in
    `langdists` -- a message is printed and the returned score is -1.

    :param langid1: 3-letter language code
    :param langid2: 3-letter language code
    :param langdists: output from :func:`wikidatastats.loaddump`
    :return: a score of script similarity, or -1 if a language is unknown
    """

    # FIXME: these should be loaded somewhere else for efficiency.
    three2two = utils.getlangmap()

    langobjs = []
    for langid in (langid1, langid2):
        # A code without a mapping can never be found in langdists, so treat it
        # like any other unknown language instead of raising KeyError.
        if langid not in three2two:
            print("%s not in langdists." % (langid,))
            return -1

        mapped = three2two[langid]
        if mapped not in langdists:
            print("%s not in langdists." % (mapped,))
            return -1

        langobjs.append(langdists[mapped])

    l1, l2 = langobjs
    return simdist(l1.charfreqs, l2.charfreqs)
Ejemplo n.º 3
0
def getclosest(lang, langdists):
    """
    This calculates script similarities between `lang` and all other languages in `langdists`.

    `lang` is translated through the map returned by :func:`utils.getlangmap`
    before it is looked up in `langdists`; a ``KeyError`` propagates if the
    code is missing from either mapping.

    :param lang: 3-letter language code
    :param langdists: output from :func:`wikidatastats.loaddump`
    :return: a map of form {langcode : float, ...}, keyed by the keys of
        `langdists` and excluding the entry for `lang` itself
    """

    three2two = utils.getlangmap()
    lang2 = three2two[lang]

    d1 = langdists[lang2].charfreqs

    chardists = {}
    for langcode in langdists:
        # langdists is indexed by the mapped code (lang2), so the entry to skip
        # is lang2; comparing against the unmapped `lang` never excluded it.
        if langcode == lang2:
            continue

        chardists[langcode] = simdist(d1, langdists[langcode].charfreqs)

    return chardists
Ejemplo n.º 4
0
def getclosest(lang, langdists):
    """
    This calculates script similarities between `lang` and all other languages in `langdists`.

    `lang` is translated through the map returned by :func:`utils.getlangmap`
    before it is looked up in `langdists`; a ``KeyError`` propagates if the
    code is missing from either mapping.

    :param lang: 3-letter language code
    :param langdists: output from :func:`wikidatastats.loaddump`
    :return: a map of form {langcode : float, ...}, keyed by the keys of
        `langdists` and excluding the entry for `lang` itself
    """

    three2two = utils.getlangmap()
    lang2 = three2two[lang]

    d1 = langdists[lang2].charfreqs

    chardists = {}
    for langcode in langdists:
        # langdists is indexed by the mapped code (lang2), so the entry to skip
        # is lang2; comparing against the unmapped `lang` never excluded it.
        if langcode == lang2:
            continue

        chardists[langcode] = simdist(d1, langdists[langcode].charfreqs)

    return chardists
Ejemplo n.º 5
0
def sim_overall_closest(l1, lambda1=1./3, lambda2=1./3, lambda3=1./3):
    """
    Given a language, this gets a list of close languages, ranked by a
    weighted sum of three similarity scores.

    The scores come from :func:`sim_phon_closest`, :func:`sim_script_closest`
    and :func:`sim_gen_closest`. Only languages that appear in all three score
    maps (after translating the 3-char code to the wikipedia langid for the
    script map) are included.

    :param l1: language passed unchanged to the three ``sim_*_closest`` functions
    :param lambda1: weight of the score from :func:`sim_phon_closest`
    :param lambda2: weight of the score from :func:`sim_script_closest`
    :param lambda3: weight of the score from :func:`sim_gen_closest`
    :return: a list of tuples ``(combined, phon, script, gen, language)`` sorted
        from highest to lowest, where ``language`` is a :class:`utils.Language`
        with ``iso3`` set
    """

    # Only the score maps (second element of each result) are needed here.
    _, sp = sim_phon_closest(l1)

    # the keys to this are wikipedia langids!
    _, ss = sim_script_closest(l1)
    _, sg = sim_gen_closest(l1)

    three2two = utils.getlangmap()

    ret = []

    # loop over languages in phoible set (which are 3 char)
    for p in sp:
        # skip if we can't get it into 2 char
        if p not in three2two:
            continue
        p2 = three2two[p]

        # if the 2 char is in the wiki name list, and the 3 char is in the wals list, we're good!
        if p2 in ss and p in sg:
            phlang = utils.Language()
            phlang.iso3 = p
            combined = lambda1 * sp[p] + lambda2 * ss[p2] + lambda3 * sg[p]
            ret.append((combined, sp[p], ss[p2], sg[p], phlang))

    # Sort on the numeric fields only. Comparing whole tuples would fall
    # through to the Language object when all four scores tie, which is
    # arbitrary on Python 2 and a TypeError on Python 3.
    # NOTE(review): assumes utils.Language defines no ordering of its own -- confirm.
    return sorted(ret, key=lambda entry: entry[:4], reverse=True)