def lang_pair(string) -> LangPair:
    """Parse a ``"xxx-yyy"`` string into a pair of standardized language tags.

    The input is stripped and split on ``-``; each half is passed through
    ``bcp47``. When the standardized spelling differs from what was given,
    a suggestion is logged at INFO level (the given value is still accepted).

    :param string: two language codes joined by a single hyphen, eg "deu-eng"
    :return: 2-tuple of the ``bcp47`` results for the left and right code
    :raises Exception: if the input does not split into exactly two parts
    """
    codes = string.strip().split('-')
    if len(codes) != 2:
        msg = f'expected value of form "xxx-yyz" eg "deu-eng"; given {string}'
        raise Exception(msg)

    left, right = codes
    std_codes = (bcp47(left), bcp47(right))
    std_form = '-'.join(map(str, std_codes))
    if std_form == string:
        # already in the standard spelling; nothing to suggest
        return std_codes

    suggestion = (
        f"Suggestion: Use codes {std_form} instead of {string}."
        f" Let's make a little space for all languages of our planet 😢."
    )
    log.info(suggestion)
    return std_codes
def parse_tmx(data, log_every=DEF_PROGRESS):
    """Stream translation units out of a TMX document.

    The document is parsed incrementally; for every ``tu`` element one dict
    is yielded that maps the language tag of each ``tuv`` child (the value of
    its attribute whose name ends with ``lang``, normalized via ``bcp47``) to
    the cleaned text of its ``seg`` child. ``tuv`` elements lacking either a
    language attribute or segment text are skipped.

    :param data: source accepted by ``ET.iterparse`` (file name or file object)
    :param log_every: minimum number of seconds between progress log lines;
        a falsy value disables progress logging
    :return: generator of ``{lang_tag: segment_text}`` dicts, one per ``tu``
    """
    events = ET.iterparse(data, events=['end'])
    n_units = 0
    started = last_report = time.time()
    for _event, unit in events:
        if unit.tag != 'tu':
            continue

        segments = {}
        for variant in unit.findall('tuv'):
            lang_values = [
                value for key, value in variant.attrib.items()
                if key.endswith('lang')
            ]
            text = variant.findtext('seg')
            if not (lang_values and text):
                continue
            tag = bcp47(lang_values[0])
            # keep each segment on a single line with no tab characters
            text = unescape(text.strip())
            text = text.replace('\n', ' ').replace('\t', ' ')
            if tag in segments:
                log.warning(
                    f"Language {tag} appears twice in same translation unit."
                )
            segments[tag] = text

        yield segments
        n_units += 1

        now = time.time()
        if log_every and (now - last_report) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - started))
            log.info(f"{elapsed} :: Parsed: {n_units:,}")
            last_report = time.time()
        # the unit has been consumed; reset the element (drops its children)
        unit.clear()
def find_bitext_pairs(cls, dir_path: Path, lang1: BCP47Tag, lang2: BCP47Tag):
    """Group the files of a directory into (lang1 file, lang2 file) pairs.

    Every non-hidden file matching ``*.*`` is expected to be named
    ``<did>.<lang>`` with an optional trailing ``.<DEF_COMPRESS>`` extension.
    ``did`` may itself contain dots (e.g. a version such as ``7.1``). The
    ``<lang>`` part is parsed with ``bcp47`` and decides which side of the
    pair the file belongs to.

    :param dir_path: directory to scan (non-recursive)
    :param lang1: language tag for the first slot of each pair
    :param lang2: language tag for the second slot of each pair
    :return: dict mapping ``did`` to ``[path_for_lang1, path_for_lang2]``
    :raises Exception: if ``lang1`` and ``lang2`` are compatible with each
        other, or if a file's language matches neither of them
    :raises AssertionError: if a file name is malformed, a file matches both
        languages, or either side of any pair is missing / no longer exists
    """
    if lang1.is_compatible(lang2):
        raise Exception(
            f"Unable to merge for {lang1}-{lang2}; it can result in unpredictable behavior."
        )
    paired_files = {}
    for path in dir_path.glob("*.*"):
        if path.name.startswith("."):
            continue  # hidden file
        parts = path.name.split(".")
        assert len(parts) >= 2, f'Invalid file name {path.name}; Unable to merge parts'
        if parts[-1] == DEF_COMPRESS:
            parts = parts[:-1]
        *did, ext = parts  # did can have a dot e.g. version 7.1
        did = '.'.join(did)
        ext = bcp47(ext)
        pair = paired_files.setdefault(did, [None, None])
        if lang1.is_compatible(ext):
            assert not lang2.is_compatible(ext)
            pair[0] = path
        elif lang2.is_compatible(ext):
            pair[1] = path
        else:
            raise Exception(
                f"Unable to decide the side of train-part {path}; ext={ext}: we have {lang1}-{lang2}"
            )
    for did, pair in paired_files.items():
        # Both sides must be present: checking only the first one would let a
        # half pair such as [path, None] slip through to the caller.
        for side in pair:
            assert side and side.exists(), \
                f'Invalid state: part {did} does not have pair, or pair is are removed'
    return paired_files
def get_entry(self, name, langs):
    """Look up an entry by name and language tuple, in either direction.

    Languages that are not already ``BCP47Tag`` instances are converted with
    ``bcp47``. The lookup first tries ``(name, langs)``; if that key is absent
    but the key with the languages reversed exists, the reversed one is used.

    :param name: entry name (must be a ``str``)
    :param langs: tuple of language tags or codes
    :return: the matching value from ``self.entries``
    :raises KeyError: if neither direction is present in ``self.entries``
    """
    assert isinstance(name, str)
    assert isinstance(langs, tuple)
    tags = tuple(
        item if isinstance(item, BCP47Tag) else bcp47(item) for item in langs
    )

    forward = (name, tags)
    if forward in self.entries:
        return self.entries[forward]

    backward = (name, tags[::-1])
    if backward in self.entries:
        return self.entries[backward]

    # neither direction exists: fail on the key as it was requested
    return self.entries[forward]
def __post_init__(self):
    """Validate the identifier fields and normalize ``langs`` after init.

    Checks that ``group``, ``name`` and ``version`` are non-empty, that
    ``name`` is lower-cased, and that none of the three contains a reserved
    character. ``langs`` must be a tuple; any member that is not already a
    ``BCP47Tag`` is converted with ``bcp47`` and the attribute is replaced
    when the conversion changed anything.

    :raises AssertionError: if any of the checks above fails
    """
    assert self.group
    assert self.name
    assert self.version
    assert self.name.islower(), \
        f'name {self.name} has to be lower cased for consistency'

    forbidden = '-/*|[](){}<>?&:;,!^$"\' '
    for value in (self.group, self.version, self.name):
        for char in forbidden:
            assert char not in value, \
                f"Character '{char}' is not permitted in name {value}"

    # ensure lang ID is BCP47 tag
    given = self.langs
    assert isinstance(given, tuple), \
        f'Expected tuple (l1, l2); given={self.langs}'
    normalized = tuple(
        item if isinstance(item, BCP47Tag) else bcp47(item) for item in given
    )
    if normalized != given:
        # bypass frozen=True
        object.__setattr__(self, 'langs', normalized)
def read_tmx(path: Union[Path, str], langs=None):
    """
    reads a TMX file as records

    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes eg (de, en); when it is None (or empty)
        the code tries to auto detect them from the first translation unit
    :return: stream of (text1, text2)
    :raises AssertionError: if ``langs`` is given but does not hold exactly two
        mutually incompatible languages
    :raises Exception: if auto detection meets a unit that does not have
        exactly two languages, if the TMX has no units at all, or if no unit
        has text for both languages
    """
    passes = 0
    fails = 0
    if langs:
        assert len(langs) == 2
        langs = [bcp47(lang) for lang in langs]
        assert not BCP47Tag.are_compatible(
            *langs), f'{langs} expected to be different (/unambiguous)'
    else:
        # An empty tuple/list used to skip both the validation above and the
        # auto-detection below, then crash on langs[0] with an IndexError.
        # Treat every "nothing given" value the same way: auto detect.
        langs = None
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {lang_seg.keys()} in TMX {path}")
            seg1, seg2 = None, None
            for lang, seg in lang_seg.items():
                if BCP47Tag.are_compatible(langs[0], lang):
                    seg1 = seg
                elif BCP47Tag.are_compatible(langs[1], lang):
                    seg2 = seg
                # else ignore: segment is in a language that was not asked for
            if seg1 and seg2:  # both segs are found
                yield seg1, seg2
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
def test_bcp47():
    """Check that ``bcp47`` results compare equal to the expected tuples.

    Expected values are 4-tuples; judging by the cases below they appear to
    be (language, script, region, joined tag), with the script reported as
    ``None`` when it is the default one for the language.
    """
    assert bcp47("en-GB")[:3] == ('eng', None, 'GB')

    english_cases = [
        ("en-GB", ('eng', None, 'GB', 'eng_GB')),
        ("en", ('eng', None, None, 'eng')),
        ("en-IN", ('eng', None, 'IN', 'eng_IN')),
        ("en-US", ('eng', None, 'US', 'eng_US')),
        # Latn script is default, so None
        ("en-Latn", ('eng', None, None, 'eng')),
        # hypothetical;
        ("en-Knda", ('eng', 'Knda', None, 'eng_Knda')),
        ("en-Latn-GB", ('eng', None, 'GB', 'eng_GB')),
    ]
    for code, expected in english_cases:
        assert bcp47(code) == expected

    try:
        bcp47("en-Latn-UK")
        fail("UK is not ISO country code")
    except ValueError:
        pass  # expected

    other_cases = [
        ("kn", ('kan', None, None, 'kan')),
        ("kn-Knda", ('kan', None, None, 'kan')),  # default script
        ("kn-Knda-IN", ('kan', None, 'IN', 'kan_IN')),
        ("kn_Knda_IN", ('kan', None, 'IN', 'kan_IN')),
        ("kn_Knda_in", ('kan', None, 'IN', 'kan_IN')),
        ("kn_Latn_IN", ('kan', 'Latn', 'IN', 'kan_Latn_IN')),
        ("kn_Deva_IN", ('kan', 'Deva', 'IN', 'kan_Deva_IN')),
        ("hi_Deva_IN", ('hin', None, 'IN', 'hin_IN')),  # default script
        ("pt_PT", ('por', None, 'PT', 'por_PT')),
        ("pt_pt", ('por', None, 'PT', 'por_PT')),
        ("pt_BR", ('por', None, 'BR', 'por_BR')),
        ("pt_br", ('por', None, 'BR', 'por_BR')),
        ("pt_Latn_br", ('por', None, 'BR', 'por_BR')),  # default script
        # non default script
        ("pt_Cyrl_br", ('por', 'Cyrl', 'BR', 'por_Cyrl_BR')),
        ("fr", ('fra', None, None, 'fra')),
        ("fr-CA", ('fra', None, 'CA', 'fra_CA')),
    ]
    for code, expected in other_cases:
        assert bcp47(code) == expected
def test_py_obj_model():
    """Test cases for python object model"""
    # object comparison: two objects are two different references
    assert bcp47("en-GB") is not bcp47("en_GB")

    # ... but spellings of the same tag have the same value and hence are
    # equal; the default script is ignored
    same_tag = [
        ("en-GB", "en_GB"),
        ("en-GB", "en_Latn_GB"),
        ("en-Latn-US", "en_US"),
        ("en-Latn", "en"),
        ("en-Latn", "english"),
        ("en-Latn", "English"),
        ("en-Latn", "ENG"),
    ]
    for left, right in same_tag:
        assert bcp47(left) == bcp47(right)
    assert bcp47("en-US") != bcp47("en")  # dont ignore region

    # Custom class, e.g. BCP47Tag, instead of plain old str obj could create
    # bugs due to improper hashing; so test it out
    seen = set()
    assert bcp47('en') not in seen
    seen.add(bcp47('en'))
    assert bcp47('en') in seen
    seen.add(bcp47('en'))
    assert len(seen) == 1  # dupes are removed
    for alias in ('en-Latn', 'english', 'English', 'eng'):
        seen.add(bcp47(alias))
    assert len(seen) == 1  # dupes are removed

    lookup = {}
    lookup[bcp47('en')] = 10
    assert lookup[bcp47('en')] == 10
    assert lookup[bcp47('en')] != 11
    for alias in ('eng', 'english', 'English-Latn'):
        assert lookup[bcp47(alias)] == 10
def is_compatible(lang1: Union[str, BCP47Tag], lang2: Union[str, BCP47Tag]):
    """Tell whether two languages are compatible with each other.

    Either argument may be a ``BCP47Tag`` or something ``bcp47`` can parse;
    the decision itself is delegated to ``BCP47Tag.is_compatible``.

    :param lang1: first language, tag object or code string
    :param lang2: second language, tag object or code string
    :return: whatever ``lang1.is_compatible(lang2)`` returns for the tags
    """
    if not isinstance(lang1, BCP47Tag):
        lang1 = bcp47(lang1)
    if not isinstance(lang2, BCP47Tag):
        lang2 = bcp47(lang2)
    return lang1.is_compatible(lang2)