コード例 #1
0
def lang_pair(string) -> LangPair:
    """Parse a ``"xxx-yyy"`` language pair into a pair of standardized codes.

    The input is stripped and split on ``-``; each half is passed through
    ``bcp47``. When the re-joined standardized form differs from the given
    string, a suggestion to use the standardized form is logged.

    :param string: language pair such as ``"deu-eng"``
    :return: tuple ``(bcp47(first), bcp47(second))``
    :raises ValueError: when the value does not split into exactly two parts
    """
    parts = string.strip().split('-')
    if len(parts) != 2:
        msg = f'expected value of form "xxx-yyy" eg "deu-eng"; given {string}'
        # ValueError is the conventional signal for a malformed value and is
        # still an Exception, so existing `except Exception` handlers work.
        raise ValueError(msg)
    std_codes = (bcp47(parts[0]), bcp47(parts[1]))
    std_form = '-'.join(str(lang) for lang in std_codes)
    if std_form != string:
        log.info(
            f"Suggestion: Use codes {std_form} instead of {string}."
            f" Let's make a little space for all languages of our planet 😢."
        )
    return std_codes
コード例 #2
0
def parse_tmx(data, log_every=DEF_PROGRESS):
    """Stream the translation units of a TMX document.

    Every ``tu`` element seen on an ``end`` parse event produces one dict that
    maps the ``bcp47``-normalized language of each ``tuv`` child to its cleaned
    ``seg`` text (stripped, unescaped, newlines and tabs turned into spaces).
    A ``tuv`` without a ``*lang`` attribute or without segment text is skipped.

    :param data: source handed to ``ET.iterparse``
    :param log_every: minimum number of seconds between progress log lines;
        a falsy value disables progress logging
    :return: generator of ``{lang: segment}`` dicts, one per translation unit
    """
    events = ET.iterparse(data, events=['end'])
    n_units = 0
    started = last_report = time.time()
    for _event, unit in events:
        if unit.tag != 'tu':
            continue
        segments = {}
        for variant in unit.findall('tuv'):
            # the language attribute may be namespaced, hence the suffix match
            lang_values = [
                value for key, value in variant.attrib.items()
                if key.endswith('lang')
            ]
            text = variant.findtext('seg')
            if not (lang_values and text):
                continue
            tag = bcp47(lang_values[0])
            text = unescape(text.strip())
            text = text.replace('\n', ' ').replace('\t', ' ')
            if tag in segments:
                log.warning(
                    f"Language {tag} appears twice in same translation unit."
                )
            segments[tag] = text  # a repeated language keeps the last segment
        yield segments
        n_units += 1
        if log_every and (time.time() - last_report) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - started))
            log.info(f"{elapsed} :: Parsed: {n_units:,}")
            last_report = time.time()
        # drop the element's content once the consumer is done with it
        unit.clear()
コード例 #3
0
ファイル: data.py プロジェクト: thammegowda/mtdata
 def find_bitext_pairs(cls, dir_path: Path, lang1: BCP47Tag,
                       lang2: BCP47Tag):
     """Group the files of a directory into (lang1 file, lang2 file) pairs.

     File names are expected to look like ``<did>.<lang>`` with an optional
     trailing ``.<DEF_COMPRESS>`` extension; ``<did>`` may itself contain dots.
     Hidden files (names starting with ``.``) are ignored.

     :param dir_path: directory whose ``*.*`` entries are inspected
     :param lang1: language of the first side
     :param lang2: language of the second side
     :return: dict mapping ``did`` to ``[lang1_path, lang2_path]``
     :raises Exception: when lang1 and lang2 are compatible with each other,
         or when a file's language extension matches neither of them
     :raises AssertionError: when a file name is malformed, an extension is
         compatible with both languages, or a part lacks either of its sides
     """
     if lang1.is_compatible(lang2):
         raise Exception(
             f"Unable to merge for {lang1}-{lang2}; it can result in unpredictable behavior."
         )
     paired_files = {}
     for path in dir_path.glob("*.*"):
         if path.name.startswith("."):
             continue
         parts = path.name.split(".")
         assert len(
             parts
         ) >= 2, f'Invalid file name {path.name}; Unable to merge parts'
         if parts[-1] == DEF_COMPRESS:
             parts = parts[:-1]
         *did, ext = parts  # did can have a dot e.g. version 7.1
         did = '.'.join(did)
         ext = bcp47(ext)
         if did not in paired_files:
             paired_files[did] = [None, None]
         if lang1.is_compatible(ext):
             assert not lang2.is_compatible(ext)
             paired_files[did][0] = path
         elif lang2.is_compatible(ext):
             paired_files[did][1] = path
         else:
             raise Exception(
                 f"Unable to decide the side of train-part {path}; ext={ext}: we have {lang1}-{lang2}"
             )
     for did, sides in paired_files.items():
         # Both sides must be present; checking only the first one would let a
         # part without its lang2 file slip through as [path, None].
         for side in sides:
             assert side and side.exists(
             ), f'Invalid state: part {did} does not have pair, or pair is removed'
     return paired_files
コード例 #4
0
ファイル: __init__.py プロジェクト: thammegowda/mtdata
 def get_entry(self, name, langs):
     """Look up an entry by name and language tuple, in either direction.

     Languages that are not already ``BCP47Tag`` instances are converted with
     ``bcp47``. The key ``(name, langs)`` is tried first; the reversed language
     order is used only when the forward key is absent.

     :param name: entry name (must be a ``str``)
     :param langs: tuple of language codes or ``BCP47Tag`` objects
     :return: the matching value from ``self.entries``
     :raises KeyError: when neither direction is present
     """
     assert isinstance(name, str)
     assert isinstance(langs, tuple)
     tags = tuple(
         item if isinstance(item, BCP47Tag) else bcp47(item) for item in langs)
     forward = (name, tags)
     if forward in self.entries:
         return self.entries[forward]
     backward = (name, tags[::-1])
     if backward in self.entries:
         return self.entries[backward]
     # neither direction exists: fail on the forward key
     return self.entries[forward]
コード例 #5
0
 def __post_init__(self):
     """Validate the identifier fields and coerce ``langs`` to BCP47 tags.

     ``group``, ``name`` and ``version`` must be non-empty and free of the
     reserved characters listed below; ``name`` must also be lower case.
     ``langs`` must be a tuple; members that are not ``BCP47Tag`` instances
     are converted with ``bcp47`` and the attribute is replaced if that
     changed anything.

     :raises AssertionError: when any of the checks fails
     """
     assert self.group and self.name and self.version
     assert self.name.islower(
     ), f'name {self.name} has to be lower cased for consistency'
     reserved = '-/*|[](){}<>?&:;,!^$"\' '
     for field in (self.group, self.version, self.name):
         # report the first reserved character (in `reserved` order) found
         offending = next((ch for ch in reserved if ch in field), None)
         assert offending is None, f"Character '{offending}' is not permitted in name {field}"
     # ensure lang ID is BCP47 tag
     assert isinstance(
         self.langs, tuple), f'Expected tuple (l1, l2); given={self.langs}'
     normalised = tuple(
         tag if isinstance(tag, BCP47Tag) else bcp47(tag)
         for tag in self.langs)
     if normalised != self.langs:
         # plain assignment is rejected on a frozen instance, so bypass it
         object.__setattr__(self, 'langs', normalised)
コード例 #6
0
def read_tmx(path: Union[Path, str], langs=None):
    """
    Reads a TMX file as a stream of aligned segment pairs.

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when it is None or empty,
        the languages are auto detected from the first translation unit,
        which must then contain exactly two languages
    :return: stream of (text1, text2)
    :raises AssertionError: when langs is given but is not a pair of two
        distinct (incompatible) languages
    :raises Exception: when auto detection sees a unit without exactly two
        languages, when the TMX is empty, or when no unit has both languages
    """
    passes = 0
    fails = 0
    if langs:
        assert len(langs) == 2
        langs = [bcp47(lang) for lang in langs]
        assert not BCP47Tag.are_compatible(
            *langs), f'{langs} expected to be different (/unambiguous)'
    else:
        # An empty tuple/list means "not set" just like None; without this,
        # langs[0] below would fail with IndexError instead of auto detecting.
        langs = None
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                # only reached for the first unit; langs is fixed afterwards
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {lang_seg.keys()} in TMX {path}")
            seg1, seg2 = None, None
            for lang, seg in lang_seg.items():
                if BCP47Tag.are_compatible(langs[0], lang):
                    seg1 = seg
                elif BCP47Tag.are_compatible(langs[1], lang):
                    seg2 = seg
                # else ignore
            if seg1 and seg2:  # both segs are found
                yield seg1, seg2
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        # fails > 0 implies at least one unit was seen, so langs is set here
        raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
コード例 #7
0
ファイル: test_bcp47.py プロジェクト: thammegowda/mtdata
def test_bcp47():
    """Check the values ``bcp47`` produces for a range of language codes.

    Each expected value is the 4-tuple the parsed tag compares equal to;
    a default script shows up as ``None`` in the second slot.
    """
    assert bcp47("en-GB")[:3] == ('eng', None, 'GB')

    english_cases = [
        ("en-GB", ('eng', None, 'GB', 'eng_GB')),
        ("en", ('eng', None, None, 'eng')),
        ("en-IN", ('eng', None, 'IN', 'eng_IN')),
        ("en-US", ('eng', None, 'US', 'eng_US')),
        # Latn script is default, so None
        ("en-Latn", ('eng', None, None, 'eng')),
        # hypothetical;
        ("en-Knda", ('eng', 'Knda', None, 'eng_Knda')),
        ("en-Latn-GB", ('eng', None, 'GB', 'eng_GB')),
    ]
    for code, expected in english_cases:
        assert bcp47(code) == expected

    try:
        bcp47("en-Latn-UK")
        fail("UK is not ISO country code")
    except ValueError:
        pass  # expected

    other_cases = [
        ("kn", ('kan', None, None, 'kan')),
        ("kn-Knda", ('kan', None, None, 'kan')),  # default script
        ("kn-Knda-IN", ('kan', None, 'IN', 'kan_IN')),
        ("kn_Knda_IN", ('kan', None, 'IN', 'kan_IN')),
        ("kn_Knda_in", ('kan', None, 'IN', 'kan_IN')),
        ("kn_Latn_IN", ('kan', 'Latn', 'IN', 'kan_Latn_IN')),
        ("kn_Deva_IN", ('kan', 'Deva', 'IN', 'kan_Deva_IN')),
        ("hi_Deva_IN", ('hin', None, 'IN', 'hin_IN')),  # default script
        ("pt_PT", ('por', None, 'PT', 'por_PT')),
        ("pt_pt", ('por', None, 'PT', 'por_PT')),
        ("pt_BR", ('por', None, 'BR', 'por_BR')),
        ("pt_br", ('por', None, 'BR', 'por_BR')),
        ("pt_Latn_br", ('por', None, 'BR', 'por_BR')),  # default script
        # non default script
        ("pt_Cyrl_br", ('por', 'Cyrl', 'BR', 'por_Cyrl_BR')),
        ("fr", ('fra', None, None, 'fra')),
        ("fr-CA", ('fra', None, 'CA', 'fra_CA')),
    ]
    for code, expected in other_cases:
        assert bcp47(code) == expected
コード例 #8
0
ファイル: test_bcp47.py プロジェクト: thammegowda/mtdata
def test_py_obj_model():
    """Verify identity, equality and hashing of the objects ``bcp47`` returns."""
    hyphenated, underscored = bcp47("en-GB"), bcp47("en_GB")
    # two calls give two distinct objects ...
    assert hyphenated is not underscored
    # ... that nevertheless carry the same value
    assert hyphenated == underscored

    # the default script is ignored in all of these pairs
    equal_pairs = [
        ("en-GB", "en_Latn_GB"),
        ("en-Latn-US", "en_US"),
        ("en-Latn", "en"),
        ("en-Latn", "english"),
        ("en-Latn", "English"),
        ("en-Latn", "ENG"),
    ]
    for left, right in equal_pairs:
        assert bcp47(left) == bcp47(right)
    assert bcp47("en-US") != bcp47("en")  # the region is not ignored

    # A custom class (instead of a plain str) can break hashing, so exercise
    # set and dict membership with freshly created, equal objects.
    seen = set()
    assert bcp47('en') not in seen
    seen.add(bcp47('en'))
    assert bcp47('en') in seen
    seen.add(bcp47('en'))
    assert len(seen) == 1  # dupes are removed
    for alias in ('en-Latn', 'english', 'English', 'eng'):
        seen.add(bcp47(alias))
    assert len(seen) == 1  # dupes are removed

    table = {bcp47('en'): 10}
    assert table[bcp47('en')] == 10
    assert table[bcp47('en')] != 11
    for alias in ('eng', 'english', 'English-Latn'):
        assert table[bcp47(alias)] == 10
コード例 #9
0
ファイル: __init__.py プロジェクト: thammegowda/mtdata
def is_compatible(lang1: Union[str, BCP47Tag], lang2: Union[str, BCP47Tag]):
    """Tell whether two languages are compatible.

    Arguments that are not already ``BCP47Tag`` instances are converted with
    ``bcp47`` (first ``lang1``, then ``lang2``) before delegating to
    ``BCP47Tag.is_compatible``.

    :param lang1: language code or tag
    :param lang2: language code or tag
    :return: result of ``lang1.is_compatible(lang2)`` on the converted tags
    """
    first, second = (
        tag if isinstance(tag, BCP47Tag) else bcp47(tag)
        for tag in (lang1, lang2))
    return first.is_compatible(second)