コード例 #1
0
ファイル: format.py プロジェクト: bartdw/lingua-franca
def _translate_word(name, lang=''):
    """ Helper to get word translations

    Looks up the resource file "text/<lang_code>/<name>.word" and returns
    its first line that is neither blank nor a "#" comment, stripped of
    surrounding whitespace.

    Args:
        name (str): Word name. Returned as the default value if not translated
        lang (str): Language code, e.g. "en-us". A falsy value selects the
            default locale; None additionally emits a NoneLangWarning.

    Returns:
        str: translated version of resource name, or `name` itself when the
            resource file is missing, unreadable, or has no usable line
    """
    from lingua_franca.internal import resolve_resource_file
    if not lang:
        if lang is None:
            warn(NoneLangWarning)
        lang = get_default_loc()

    lang_code = lang if is_supported_full_lang(lang) else \
        get_full_lang_code(lang)

    filename = resolve_resource_file(join("text", lang_code, name + ".word"))
    if not filename:
        return name  # no resource for this word/language

    try:
        with open(filename, 'r', encoding='utf8') as f:
            for line in f:
                word = line.strip()
                # Skip blank lines as well as comments: a leading empty line
                # must not be handed back as an empty "translation".
                if not word or word.startswith("#"):
                    continue
                return word
    except (OSError, UnicodeError):
        # Best effort: an unreadable or badly encoded resource falls back to
        # the resource name. Other exceptions indicate real bugs and are
        # allowed to propagate.
        pass
    return name  # use resource name as the word
コード例 #2
0
ファイル: parse_pt.py プロジェクト: ybkimm/lingua-franca
class PortugueseNormalizer(Normalizer):
    """Normalizer configured from the "text/pt-pt/normalize.json" resource.

    The JSON file is read once, while the class body executes, and its
    parsed content is stored in ``_default_config``.
    """
    # Decode explicitly as UTF-8 so loading does not depend on the platform's
    # default locale encoding.
    with open(resolve_resource_file("text/pt-pt/normalize.json"),
              encoding="utf8") as f:
        _default_config = json.load(f)

    @staticmethod
    def tokenize(utterance):
        """Split an utterance into whitespace-separated tokens.

        Before splitting, a space is inserted between a number and a
        following "%", between "#" and a following number, and around the
        hyphen joining two runs of ASCII letters. A single trailing "-"
        token is dropped.

        Args:
            utterance (str): text to tokenize

        Returns:
            list: the tokens; empty when the utterance has no
                non-whitespace content
        """
        # Split things like 12%
        utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance)
        # Split things like #1
        utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance)
        # Split things like amo-te
        # NOTE(review): the pattern only matches ASCII letters, so hyphenated
        # words containing accented characters are left joined - confirm
        # whether that is intended.
        utterance = re.sub(r"([a-zA-Z]+)(-)([a-zA-Z]+\b)", r"\1 \2 \3",
                           utterance)
        tokens = utterance.split()
        # Guard against an empty token list: indexing [-1] on it would raise
        # IndexError for empty or whitespace-only input.
        if tokens and tokens[-1] == '-':
            tokens = tokens[:-1]

        return tokens