def test_bcp47():
    """iso3_code should reduce BCP-47 tags to their base ISO 639-3 language code."""
    # tags carrying region/script/extension subtags collapse to the base language
    expected = {
        "en-GB": 'eng',
        "gsw-u-sd-chzh": 'gsw',
        "he-IL-u-ca-hebrew-tz-jeruslm": 'heb',
        # language with an hyphen in their name should work as well
        'Teke-Tsaayi': "tyi",
        'Umbu-Ungu': "ubu",
    }
    for tag, code in expected.items():
        assert iso3_code(tag) == code
def parse_tmx(data, log_every=DEF_PROGRESS):
    """Stream translation units out of a TMX file.

    Iterates `data` with ElementTree's incremental parser and, for each
    <tu> element, yields a dict mapping ISO 639-3 language code -> cleaned
    segment text. Elements are cleared after use to bound memory.

    :param data: file path or file-like object accepted by ET.iterparse
    :param log_every: seconds between progress log lines (falsy disables logging)
    """
    stream = ET.iterparse(data, events=['end'])
    n_units = 0
    start_time = last_log = time.time()
    for _event, node in stream:
        if node.tag != 'tu':
            continue
        unit = {}
        for tuv in node.findall('tuv'):
            # the language attribute may be namespaced; match any *lang attribute
            lang_vals = [val for attr, val in tuv.attrib.items() if attr.endswith('lang')]
            text = tuv.findtext('seg')
            if lang_vals and text:
                code = iso3_code(lang_vals[0], fail_error=True)
                # flatten whitespace so one segment stays on one output line
                text = unescape(text.strip()).replace('\n', ' ').replace('\t', ' ')
                if code in unit:
                    log.warning(
                        f"Language {code} appears twice in same translation unit."
                    )
                unit[code] = text
        yield unit
        n_units += 1
        if log_every and (time.time() - last_log) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - start_time))
            log.info(f"{elapsed} :: Parsed: {n_units:,}")
            last_log = time.time()
        node.clear()  # free the parsed subtree; we are done with this unit
def __init__(self, langs: Tuple[str, str], name: str, url: str,
             filename: Optional[str] = None, ext: Optional[str] = None,
             in_paths: Optional[List[str]] = None, in_ext: Optional[str] = None,
             cite: Optional[str] = None, cols: Optional[Tuple[int, int]] = None):
    """Create a dataset entry.

    :param langs: pair of language codes; normalized to ISO 639-3
    :param name: dataset name; must not contain path-like characters
    :param url: download URL; its last path component supplies a default filename
    :param filename: local filename override
    :param ext: file extension override (otherwise detected from filename/url)
    :param in_paths: paths inside an archive (required for archive extensions)
    :param in_ext: extension of files inside the archive
    :param cite: citation text
    :param cols: column indices to pick from the data file
    """
    assert isinstance(langs, tuple)
    assert len(langs) == 2
    # names end up in file system paths, so separators and globs are banned
    for bad_ch in '.-/* ':
        assert bad_ch not in name, f"Character '{bad_ch}' is not permitted in name {name}"
    self.langs = tuple(iso3_code(code, fail_error=True) for code in langs)
    self.name = name
    self.url = url
    remote_name = self.url.split('/')[-1]
    self.ext = ext or detect_extension(filename or remote_name)
    self.filename = filename or f'{self.name}.{self.ext}'
    self.in_paths = in_paths
    self.in_ext = in_ext
    self.cite = cite
    self.cols = cols
    self.is_archive = self.ext in ('zip', 'tar', 'tar.gz', 'tgz')
    if self.is_archive:
        # without in_paths we would not know which member files to extract
        assert self.in_paths and len(
            self.in_paths) > 0, 'Archive entries must have in_paths'
def __init__(self, data):
    """Build lookup tables from raw BCP-47 data.

    :param data: dict with 'languages' (code3, code2, name triples),
        'scripts' and 'countries' (code, name pairs), and
        'default_scripts' (lang, script, name triples)
    """
    self.data = data
    # fix: error message said 'bcp4j'; the data format is BCP-47
    assert all(key in data for key in ['languages', 'scripts', 'countries'
                                       ]), 'malformed bcp47 data'
    self.scripts = dict(data['scripts'])
    self.countries = dict(data['countries'])
    self.languages = {
        code3: (code2, name)
        for code3, code2, name in data['languages']
    }
    for key in self.languages:  # validation: keys must already be canonical ISO 639-3
        assert key == iso3_code(key, fail_error=True)
    # these needs suppression; eng-Latn is just eng, as Latn is default
    self.default_scripts = {}
    for lang_code, script_code, _lang_name in data['default_scripts']:
        code3 = iso3_code(lang_code, fail_error=True)
        assert script_code in self.scripts
        self.default_scripts[code3] = script_code
def LangPair(string):
    """Argparse type: parse an "xx-yy" language pair into a tuple of ISO 639-3 codes.

    :param string: pair such as "de-en"
    :raises argparse.ArgumentTypeError: when the value is not exactly two parts
    """
    pieces = string.split('-')
    if len(pieces) != 2:
        raise argparse.ArgumentTypeError(
            f'expected value of form "xx-yy" eg "de-en"; given {string}')
    codes = [iso3_code(piece, fail_error=True) for piece in pieces]
    if codes != pieces:
        # nudge users toward three-letter codes, which cover every language
        log.warning(
            f"Suggestion: Use ISO 639_3 codes {'-'.join(codes)} instead of {string}."
            f" Let's make a little space for all 7000+ languages of our planet 😢."
        )
    return tuple(codes)
def main(langs=None):
    """Print the ISO 639-3 code and name for each input language.

    With no languages given, dumps the full ISO 639-3 table instead.
    """
    from mtdata.iso import iso3_code
    from mtdata.iso.iso639_3 import code_to_name, data as ISO639_3

    langs = langs or parse_args().get('langs', [])
    if not langs:
        print(ISO639_3)
        return
    print(f"Input\tISO639_3\tName")
    for lang in langs:
        code = iso3_code(lang)
        name = code_to_name(code) if code else '-none-'
        print(f"{lang}\t{code or '-none-'}\t{name}")
def parse(self, tag) -> BCP47Tag:
    """Parse a (up to three part) language tag into a BCP47Tag.

    Parameters
    ----------
    tag : tag to be parsed; '_' is treated as '-'

    Returns
    -------
    BCP47Tag

    Raises
    ------
    ValueError
        when the tag is empty, too long, or any part is unrecognized.
        (Previously some of these paths used `assert`, which is stripped
        under `python -O` and raised AssertionError instead of ValueError.)
    """
    code_orig = tag
    tag = tag.replace('_', '-').strip()
    if not tag:
        raise ValueError('Empty language tag given')
    parts = tag.split('-')
    if not 1 <= len(parts) <= 3:
        raise ValueError(
            f'BCP47 code longer than 3 parts not supported yet; given {code_orig}')
    lang, script, region = None, None, None
    # part 1: it has to be language
    lang = iso3_code(parts[0], default=None)
    if not lang or lang not in self.languages:
        raise ValueError(
            f'Unable to recognize {code_orig}; Unknown language')
    parts = parts[1:]
    if parts:  # part 2 can be either script or region code
        if parts[0].title() in self.scripts:
            script = parts[0].title()
        elif parts[0].upper() in self.countries:
            region = parts[0].upper()
        elif parts[0] == 'XX':  # placeholder for a country
            pass
        else:
            raise ValueError(f'Unable to parse {code_orig}')
        parts = parts[1:]
    if parts:  # part 3, if it exists, must be a region after a script
        # a third subtag only makes sense as <lang>-<script>-<region>
        if not script or region:
            raise ValueError(f"Cant find {code_orig}; Unknown region")
        if parts[0].upper() in self.countries:
            region = parts[0].upper()
        else:
            raise ValueError(f"Cant find {code_orig}; Unknown region")
        parts = parts[1:]
    assert not parts  # internal invariant: all parts are consumed
    if script and self.default_scripts.get(lang) == script:
        script = None  # suppress script; eng-Latn is just eng, as Latn is default
    return BCP47Tag(lang=lang, script=script, region=region)
def main(langs=None):
    """Print ISO 639-3 code and name for each input language.

    Without languages, dumps the full ISO 639-3 table. In `brief` mode,
    unresolvable codes are errors and the input column is omitted.
    """
    from mtdata.iso import iso3_code
    from mtdata.iso.iso639_3 import code_to_name, data as ISO639_3

    args = parse_args()
    langs = langs or args.get('langs', [])
    brief = args.get('brief')
    if not langs:
        print(ISO639_3)
        return
    if not brief:
        print(f"Input\tISO639_3\tName")
    for lang in langs:
        code = iso3_code(lang, fail_error=brief)
        name = code_to_name(code) if code else '-none-'
        if brief:
            assert code, f'Unable to resolve {lang} to valid language code.'
            print(f"{code}\t{name}")
        else:
            print(f"{lang}\t{code or '-none-'}\t{name}")
    # NOTE(review): this span is the middle of a larger if/elif dispatch on
    # `type` (the 'opus' branch begins above this view) — confirm against the
    # enclosing function before moving code here.
    exclude += ['OPUS100v', 'WMT-News']
    # query the OPUS API for the latest moses-format corpora of this pair
    datasets = requests.get(
        f'https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest'
    ).json()
    names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets['corpora']]
elif type == 'sacrebleu':
    import sacrebleu
    # keep datasets whose metadata mentions the pair in either direction
    names = [
        f'sacrebleu_{name}' for name, meta in sacrebleu.DATASETS.items()
        if f'{source}-{target}' in meta or f'{target}-{source}' in meta
    ]
elif type == 'mtdata':
    from mtdata.entry import LangPair, lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code
    # mtdata indexes languages by ISO 639-3; normalize before querying
    source_tricode = iso3_code(source, fail_error=True)
    target_tricode = iso3_code(target, fail_error=True)
    exclude += ['opus', 'newstest', 'UNv1']
    entries = sorted(get_entries(
        lang_pair(source_tricode + '-' + target_tricode), None, None, True),
                     key=lambda entry: entry.did.group)
    names = [
        f'mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}'
        for entry in entries
    ]
else:
    print(
        f'Importer type {type} is unsupported. Supported importers: opus, mtdata, sacrebleu'
    )
cleaned = set()
def test_iso3_code():
    """iso3_code resolves 2/3-letter codes and names, case-insensitively."""
    # two-letter and three-letter codes, any capitalization
    assert iso3_code("kn") == 'kan'
    assert iso3_code("KN") == 'kan'
    assert iso3_code("Kannada") == 'kan'
    assert iso3_code("kannada") == 'kan'
    assert iso3_code("kan") == 'kan'
    assert iso3_code("KANNADA") == 'kan'
    assert iso3_code("KaNnAdA") == 'kan'
    assert iso3_code("KAN") == 'kan'
    assert iso3_code("ne") == 'nep'
    assert iso3_code("nep") == 'nep'
    assert iso3_code("Nepali") == 'nep'
    assert iso3_code("Nepali (individual)") == 'npi'
    # unknown codes: None by default, or the caller-supplied default
    assert iso3_code("xyz") is None  # fix: `is None`, not `== None`
    assert iso3_code("xyz", default="Error") == "Error"
    # fail_error=True must raise for an unknown code.
    # fix: the old bare `except:` also caught the `assert False` inside the
    # try body, so this check could never fail; track the outcome instead.
    raised = False
    try:
        iso3_code("xyz", fail_error=True)
    except Exception:
        raised = True
    assert raised, 'Expected an exception'
def __post_init__(self):
    # normalize language codes to ISO 639-3; raises on unknown codes
    self.langs = tuple(iso3_code(code, fail_error=True) for code in self.langs)
    # every test and train entry must be truthy
    assert all(self.tests)
    assert all(self.train)