Exemple #1
0
    def get_territory_language_info(self):
        """Look up language information for this object's territory.

        Delegates to ``languages.get_territory_language_info`` with
        ``self.territory`` as the only argument.

        Return:
          - whatever the delegated lookup returns, unchanged
        """
        lookup = languages.get_territory_language_info
        return lookup(self.territory)
Exemple #2
0
def get_country_dict():
    """Build a lookup from lowercase country-name variants to a canonical name.

    For every entry in ``pycountry.countries`` the following keys are mapped
    to the lowercase ``country.name``:

    - the lowercase ``name`` itself,
    - the lowercase ``official_name`` when the entry has that attribute,
    - the lowercase territory name taken from
      ``Locale(lang, alpha_2).territories`` for each language key returned by
      ``ln.get_territory_language_info(alpha_2)``.

    The alias ``"usa"`` is added at the end.

    Returns:
        dict: lowercase name variant -> lowercase country name.
    """
    country_dict = {}
    for country in pycountry.countries:
        info = ln.get_territory_language_info(country.alpha_2)
        canonical = country.name.lower()
        country_dict[canonical] = canonical
        if hasattr(country, "official_name"):
            country_dict[country.official_name.lower()] = canonical
        for lang in info:
            try:
                locale = Locale(lang, country.alpha_2)
                localized = locale.territories[country.alpha_2].lower()
            except Exception:
                # Best effort: a language/territory pair for which no
                # localized name can be obtained is simply skipped.
                # ``Exception`` (instead of a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
            country_dict[localized] = canonical
    country_dict["usa"] = "united states of america"
    return country_dict
Exemple #3
0
    def __init__(self,
                 country_code=None,
                 text_samples=None,
                 lang=None,
                 tmp_dir=""):
        """Resolve the language and prepare the download URL and local path.

        Exactly one of two call styles is accepted:

        * ``lang`` alone: the language code is used as given.
        * ``country_code`` together with ``text_samples``: the language is
          detected from the samples and must be one of the territory's
          languages whose ``official_status`` is not ``None``.

        Args:
            country_code: territory code passed to
                ``get_territory_language_info``.
            text_samples: iterable of texts; each one is passed to ``detect``.
            lang: language code inserted into the URL and file name.
            tmp_dir: directory joined with the file name to form ``self.path``.

        Raises:
            AssertionError: if the argument combination is invalid, if the
                territory has no official language, or if the most common
                detected language is not an official one.
            IndexError: if ``text_samples`` yields no samples.
        """
        # Check whether the argument combination is valid
        lang_only = (country_code is None and text_samples is None
                     and lang is not None)
        detect_from_samples = (country_code is not None
                               and text_samples is not None and lang is None)
        # Raised explicitly (not via ``assert``) so the validation still runs
        # under ``python -O``; the exception type and message are unchanged.
        if not (lang_only or detect_from_samples):
            raise AssertionError("Invalid argument combination.")

        # If the language is not specified then identify it
        if lang is None:
            # Generate the list of official languages for this country
            lang_info = get_territory_language_info(country_code)
            off_langs = [
                code for code, info in lang_info.items()
                if info["official_status"] is not None
            ]
            # Check that a language was found
            if not off_langs:
                raise AssertionError(
                    "No official languages found for {}".format(country_code))

            # Identify the language from multiple samples
            langs = [detect(t) for t in text_samples]

            # Keep the most frequent detection and check that it is valid
            lang, _ = Counter(langs).most_common(1)[0]
            if lang not in off_langs:
                raise AssertionError(
                    "Language '{}' not in {}".format(lang, off_langs))
            print(("Decided that '{}' is the "
                   "language of this text").format(lang))

        # Prepare the zip url and filename strings
        self.zip_url = ("https://s3-us-west-1.amazonaws.com/"
                        "fasttext-vectors/wiki.{}.zip").format(lang)
        self.filename = "wiki.{}.bin".format(lang)
        self.path = os.path.join(tmp_dir, self.filename)
Exemple #4
0
def get_lang_codes_for_territory(territory_code,
                                 min_pop_perc=0.2,
                                 official_status=False):
    """
    Select language codes for a territory via babel.languages.get_territory_language_info.
    Upstream source: https://github.com/python-babel/babel/blob/master/babel/languages.py#L45

    :param territory_code: territory code handed to babel unchanged
    :param min_pop_perc: minimum share of the population, as a fraction; it is
        multiplied by 100 and compared against each entry's 'population_percent'
    :param official_status: when equal to False, no status filtering is done;
        otherwise only entries whose 'official_status' equals this value are kept

    returns list of language codes, in the iteration order of babel's result
    """
    from babel import languages

    selected = []
    for code, details in languages.get_territory_language_info(
            territory_code).items():
        # Population threshold first, exactly as many percent as requested.
        if not (details['population_percent'] >= (min_pop_perc * 100)):
            continue
        # ``== False`` (not ``is False``) is deliberate: it keeps the original
        # comparison semantics for the "return all" sentinel.
        if (official_status == False) or (
                details['official_status'] == official_status):
            selected.append(code)
    return selected
Exemple #5
0
def test_get_language_info():
    """The language keys reported for "HU" must match the expected set exactly."""
    found = set(get_territory_language_info("HU").keys())
    wanted = {"hu", "en", "de", "ro", "hr", "sk", "sl"}
    assert found == wanted
Exemple #6
0
def test_get_language_info():
    """The language keys reported for "HU" must match the expected set exactly."""
    reported = set(get_territory_language_info("HU").keys())
    expected = {"hu", "en", "de", "ro", "hr", "sk", "sl"}
    assert reported == expected
Exemple #7
0
def test_get_language_info():
    """Iterating the "HU" language info must yield exactly the expected codes."""
    reported = set(get_territory_language_info("HU"))
    expected = {"hu", "fr", "en", "de", "ro", "hr", "sk", "sl"}
    assert reported == expected