Exemple #1
0
def test_bcp47():
    """Check that iso3_code resolves BCP47-style tags and hyphenated names."""
    expectations = [
        # tags carrying region and/or extension subtags
        ("en-GB", 'eng'),
        ("gsw-u-sd-chzh", 'gsw'),
        ("he-IL-u-ca-hebrew-tz-jeruslm", 'heb'),
        # languages with a hyphen in their name should work as well
        ('Teke-Tsaayi', "tyi"),
        ('Umbu-Ungu', "ubu"),
    ]
    for given, expected in expectations:
        assert iso3_code(given) == expected
Exemple #2
0
def parse_tmx(data, log_every=DEF_PROGRESS):
    """Stream translation units out of a TMX document.

    Parameters
    ----------
    data : source handed straight to ``ET.iterparse``
    log_every : minimum number of seconds between progress log lines;
        a falsy value turns progress logging off

    Yields
    ------
    dict mapping the language code returned by ``iso3_code`` to the segment
    text of that language, one dict per ``<tu>`` element. Units without any
    usable ``<tuv>`` yield an empty dict.
    """
    events = ET.iterparse(data, events=['end'])
    started = last_report = time.time()
    n_units = 0
    for _event, unit in events:
        if unit.tag != 'tu':
            continue
        segments = {}
        for variant in unit.findall('tuv'):
            # the language attribute may be namespaced, hence the suffix match
            lang_attrs = [
                val for key, val in variant.attrib.items()
                if key.endswith('lang')
            ]
            text = variant.findtext('seg')
            if not (lang_attrs and text):
                continue
            code = iso3_code(lang_attrs[0], fail_error=True)
            # flatten to a single line: newlines and tabs become spaces
            text = unescape(text.strip())
            text = text.replace('\n', ' ').replace('\t', ' ')
            if code in segments:
                log.warning(
                    f"Language {code} appears twice in same translation unit."
                )
            segments[code] = text  # a repeated language keeps the last segment
        yield segments
        n_units += 1
        if log_every and (time.time() - last_report) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - started))
            log.info(f"{elapsed} :: Parsed: {n_units:,}")
            last_report = time.time()
        # drop the element's contents once the consumer has moved on
        unit.clear()
Exemple #3
0
    def __init__(self,
                 langs: Tuple[str, str],
                 name: str,
                 url: str,
                 filename: Optional[str] = None,
                 ext: Optional[str] = None,
                 in_paths: Optional[List[str]] = None,
                 in_ext: Optional[str] = None,
                 cite: Optional[str] = None,
                 cols: Optional[Tuple[int, int]] = None):
        """Validate and store the description of one dataset entry.

        Parameters
        ----------
        langs : 2-tuple of languages; each item is normalised through
            ``iso3_code(..., fail_error=True)``
        name : entry name; may not contain ``.``, ``-``, ``/``, ``*`` or a space
        url : location of the data; its last ``/``-separated component is the
            fallback used for extension detection
        filename : file name to use; defaults to ``{name}.{ext}``
        ext : extension; when omitted it is obtained from ``detect_extension``
            applied to ``filename`` or, failing that, the URL's last component
        in_paths : must be non-empty when the extension is one of
            ``zip``, ``tar``, ``tar.gz`` or ``tgz``
        in_ext, cite, cols : stored unchanged
        """
        assert isinstance(langs, tuple)
        assert len(langs) == 2
        for banned in '.-/* ':
            assert banned not in name, f"Character '{banned}' is not permitted in name {name}"

        self.langs = tuple(iso3_code(item, fail_error=True) for item in langs)
        self.name = name
        self.url = url

        # extension: explicit value wins, otherwise detect it from a file name
        url_basename = self.url.split('/')[-1]
        self.ext = ext or detect_extension(filename or url_basename)
        self.filename = filename or f'{self.name}.{self.ext}'

        self.in_paths = in_paths
        self.in_ext = in_ext
        self.cite = cite
        self.cols = cols

        archive_exts = ('zip', 'tar', 'tar.gz', 'tgz')
        self.is_archive = self.ext in archive_exts
        if self.is_archive:
            has_members = self.in_paths and len(self.in_paths) > 0
            assert has_members, 'Archive entries must have in_paths'
Exemple #4
0
 def __init__(self, data):
     """Index the raw lookup tables in ``data`` and validate their codes.

     ``data`` must contain the keys ``languages``, ``scripts`` and
     ``countries``; ``default_scripts`` is read as well. As consumed here:
     ``scripts`` and ``countries`` are iterables of ``(code, name)`` pairs,
     ``languages`` of ``(code3, code2, name)`` triples and
     ``default_scripts`` of ``(lang_code, script_code, lang_name)`` triples.
     """
     self.data = data
     required_keys = ['languages', 'scripts', 'countries']
     assert all(key in data for key in required_keys), 'malformed bcp4j data'

     scripts = {}
     for code, name in data['scripts']:
         scripts[code] = name
     self.scripts = scripts

     countries = {}
     for code, name in data['countries']:
         countries[code] = name
     self.countries = countries

     languages = {}
     for code3, code2, name in data['languages']:
         languages[code3] = (code2, name)
     self.languages = languages

     # validation: every language key must already be in the form iso3_code returns
     for key in self.languages:
         assert key == iso3_code(key, fail_error=True)

     # These need suppression; eng-Latn is just eng, as Latn is the default.
     self.default_scripts = {}
     for lang_code, script_code, lang_name in data['default_scripts']:
         code3 = iso3_code(lang_code, fail_error=True)
         assert script_code in self.scripts
         self.default_scripts[code3] = script_code
Exemple #5
0
def LangPair(string):
    """Convert an ``xx-yy`` string into a 2-tuple of language codes.

    Each half is resolved with ``iso3_code(..., fail_error=True)``. A warning
    is logged when the resolved codes differ from what was typed.

    Raises
    ------
    argparse.ArgumentTypeError
        if ``string`` does not split into exactly two parts on ``-``
    """
    parts = string.split('-')
    if len(parts) == 2:
        resolved = tuple(iso3_code(part, fail_error=True) for part in parts)
        if list(resolved) != parts:
            suggestion = '-'.join(resolved)
            log.warning(
                f"Suggestion: Use ISO 639_3 codes {suggestion} instead of {string}."
                f" Let's make a little space for all 7000+ languages of our planet 😢."
            )
        return resolved
    raise argparse.ArgumentTypeError(
        f'expected value of form "xx-yy" eg "de-en"; given {string}')
Exemple #6
0
def main(langs=None):
    """Print the ISO 639-3 code and name for each requested language.

    ``langs`` falls back to the ``langs`` entry of ``parse_args()`` when it
    is empty or omitted. With nothing to look up, the ISO 639-3 ``data``
    object is printed instead. Unresolvable inputs show ``-none-`` in both
    the code and the name column.
    """
    from mtdata.iso import iso3_code
    from mtdata.iso.iso639_3 import code_to_name, data as ISO639_3
    if not langs:
        langs = parse_args().get('langs', [])
    if not langs:
        print(ISO639_3)
        return
    print("Input\tISO639_3\tName")
    for query in langs:
        code = iso3_code(query)
        if code:
            name = code_to_name(code)
        else:
            code = name = '-none-'
        print(f"{query}\t{code}\t{name}")
Exemple #7
0
    def parse(self, tag) -> BCP47Tag:
        """
        Parse a tag of up to three parts: language[-script][-region].

        Underscores are treated as hyphens. A script equal to the language's
        entry in ``self.default_scripts`` is dropped from the result.

        Parameters
        ----------
        tag : tag to be parsed

        Returns
        -------
            BCP47Tag

        Raises
        ------
        ValueError
            if the language, the second part or the region is not recognized
        AssertionError
            if the tag is empty, has more than three parts, or has a third
            part that does not follow a script
        """
        normalized = tag.replace('_', '-').strip()
        assert normalized
        remaining = normalized.split('-')
        assert 1 <= len(
            remaining
        ) <= 3, f'BCP47 code longer than 3 parts not supported yet; given {tag}'

        # part 1: it has to be language
        lang = iso3_code(remaining[0], default=None)
        if not lang or lang not in self.languages:
            raise ValueError(f'Unable to recognize {tag}; Unknown language')
        remaining = remaining[1:]

        script = region = None
        if remaining:  # part 2 can be either script or region code
            second = remaining[0]
            as_script, as_region = second.title(), second.upper()
            if as_script in self.scripts:
                script = as_script
            elif as_region in self.countries:
                region = as_region
            elif second != 'XX':  # 'XX' is a placeholder for a country
                raise ValueError(f'Unable to parse {tag}')
            remaining = remaining[1:]

        if remaining:  # part 3, if it exists, must be a region
            assert script
            assert not region
            third = remaining[0].upper()
            if third not in self.countries:
                raise ValueError(f"Cant find {tag}; Unknown region")
            region = third
            remaining = remaining[1:]

        assert not remaining  # all parts are consumed
        if script and self.default_scripts.get(lang) == script:
            script = None  # suppress script
        return BCP47Tag(lang=lang, script=script, region=region)
Exemple #8
0
def main(langs=None):
    """Print ISO 639-3 codes and names for the requested languages.

    ``langs`` falls back to the ``langs`` entry of ``parse_args()``. With
    nothing to look up, the ISO 639-3 ``data`` object is printed. When the
    ``brief`` argument is set, the header and the input column are left out
    and an unresolved language is an error; otherwise unresolved inputs are
    shown as ``-none-``.
    """
    from mtdata.iso import iso3_code
    from mtdata.iso.iso639_3 import code_to_name, data as ISO639_3
    cli_args = parse_args()
    langs = langs or cli_args.get('langs', [])
    brief = cli_args.get('brief')
    if langs:
        if not brief:
            print("Input\tISO639_3\tName")
        for query in langs:
            code = iso3_code(query, fail_error=brief)
            name = code_to_name(code) if code else '-none-'
            if not brief:
                shown_code = code or '-none-'
                print(f"{query}\t{shown_code}\t{name}")
            else:
                assert code, f'Unable to resolve {query} to valid language code.'
                print(f"{code}\t{name}")
    else:
        print(ISO639_3)
    exclude += ['OPUS100v', 'WMT-News']
    datasets = requests.get(
        f'https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest'
    ).json()
    names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets['corpora']]
elif type == 'sacrebleu':
    import sacrebleu
    names = [
        f'sacrebleu_{name}' for name, meta in sacrebleu.DATASETS.items()
        if f'{source}-{target}' in meta or f'{target}-{source}' in meta
    ]
elif type == 'mtdata':
    from mtdata.entry import LangPair, lang_pair
    from mtdata.index import get_entries
    from mtdata.iso import iso3_code
    source_tricode = iso3_code(source, fail_error=True)
    target_tricode = iso3_code(target, fail_error=True)
    exclude += ['opus', 'newstest', 'UNv1']
    entries = sorted(get_entries(
        lang_pair(source_tricode + '-' + target_tricode), None, None, True),
                     key=lambda entry: entry.did.group)
    names = [
        f'mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}'
        for entry in entries
    ]
else:
    print(
        f'Importer type {type} is unsupported. Supported importers: opus, mtdata, sacrebleu'
    )

cleaned = set()
Exemple #10
0
def test_iso3_code():
    """Exercise iso3_code with 2-letter codes, 3-letter codes and names."""
    # codes and names are matched regardless of letter case
    assert iso3_code("kn") == 'kan'
    assert iso3_code("KN") == 'kan'
    assert iso3_code("Kannada") == 'kan'
    assert iso3_code("kannada") == 'kan'
    assert iso3_code("kan") == 'kan'
    assert iso3_code("KANNADA") == 'kan'
    assert iso3_code("KaNnAdA") == 'kan'
    assert iso3_code("KAN") == 'kan'

    assert iso3_code("ne") == 'nep'
    assert iso3_code("nep") == 'nep'
    assert iso3_code("Nepali") == 'nep'
    assert iso3_code("Nepali (individual)") == 'npi'

    # unknown input: None by default, the supplied default otherwise
    assert iso3_code("xyz") is None
    assert iso3_code("xyz", default="Error") == "Error"

    # fail_error=True must raise. The failure is signalled from the `else`
    # branch so that it is not swallowed by the handler, which is what a
    # bare `except:` around `assert False` would do.
    try:
        iso3_code("xyz", fail_error=True)
    except Exception:
        pass
    else:
        raise AssertionError('Expected an exception')
Exemple #11
0
 def __post_init__(self):
     """Normalise ``langs`` through iso3_code and reject falsy test/train items."""
     normalized = []
     for lang in self.langs:
         normalized.append(iso3_code(lang, fail_error=True))
     self.langs = tuple(normalized)
     # every item listed under tests and train must be truthy
     assert all(self.tests)
     assert all(self.train)