Esempio n. 1
0
def get_iso(d):
    """Return an ``ISO`` object for the code tables found in directory ``d``.

    Files in ``d`` matching ``iso-639-3_Code_Tables_*.zip`` are ordered by
    file name and the last one is handed to ``ISO``.  When nothing matches,
    ``ISO`` is given the result of ``download_tables(d)`` instead.

    :param d: Directory to search; anything ``Path`` accepts.
    :return: The ``ISO`` instance.
    """
    candidates = list(Path(d).glob('iso-639-3_Code_Tables_*.zip'))
    candidates.sort(key=lambda path: path.name)
    if not candidates:
        return ISO(download_tables(d))  # pragma: no cover

    # The name that sorts last wins.
    return ISO(candidates[-1])
Esempio n. 2
0
def check_tree(args):
    """Check the languoid tree for inconsistencies and log the findings.

    For every languoid yielded by ``walk_tree`` the following is checked:

    - if ISO tables were requested, the languoid's ISO code must be contained
      in them (warning) and, unless the languoid's category is
      'Bookkeeping', must not be retired (warning);
    - the id must be a registered glottocode, unless it starts with 'unun9'
      (error);
    - ``level``, ``name`` and ``glottocode`` must be truthy (error);
    - the directory name must match ``Glottocode.pattern`` (error);
    - a language with a parent must have a family as parent, and all its
      children must be dialects (error);
    - a family's directory must contain at least one sub-directory (error).

    Afterwards the number of languoids per level and of languages per
    category is logged.

    :param args: Object providing ``repos`` (passed on to ``languoids_path``) \
    and ``args``.  If ``args.args`` is non-empty, ISO checks are enabled and \
    its first item is used as path for ``ISO`` when that path exists; \
    otherwise ``ISO(None)`` is used.
    :return: ``Counter`` mapping level names to languoid counts.
    """
    if args.args:
        iso = ISO(args.args[0] if Path(args.args[0]).exists() else None)
    else:
        # No positional argument: skip all ISO related checks.
        iso = None

    tree = languoids_path('tree', repos=args.repos)
    glottocodes = Glottocodes()
    log.info('checking tree at %s' % tree)
    by_level = Counter()
    by_category = Counter()
    for lang in walk_tree(tree=tree):
        by_level.update([lang.level.name])
        if lang.level == Level.language:
            by_category.update([lang.category])

        if iso and lang.iso:
            if lang.iso not in iso:
                log.warn('invalid ISO-639-3 code: %s [%s]' % (lang.id, lang.iso))
            else:
                isocode = iso[lang.iso]
                if isocode.is_retired and lang.category != 'Bookkeeping':
                    msg = '%s %s' % (lang.id, repr(isocode))
                    # Only name a replacement when it is unambiguous.
                    if len(isocode.change_to) == 1:
                        msg += ' changed to %s' % repr(isocode.change_to[0])
                    log.warn(msg)

        if not lang.id.startswith('unun9') and lang.id not in glottocodes:
            log.error('unregistered glottocode %s' % lang.id)
        for attr in ['level', 'name', 'glottocode']:
            if not getattr(lang, attr):
                log.error('missing %s: %s' % (attr, lang.id))
        if not Glottocode.pattern.match(lang.dir.name):
            log.error('invalid directory name: %s' % lang.dir.name)
        if lang.level == Level.language:
            if lang.parent and lang.parent.level != Level.family:
                log.error('invalid nesting of language under {0}: {1}'.format(
                    lang.parent.level, lang.id))
            for child in lang.children:
                if child.level != Level.dialect:
                    log.error('invalid nesting of {0} under language: {1}'.format(
                        child.level, child.id))
        elif lang.level == Level.family:
            # for/else: the error is only logged when the loop finishes
            # without finding any sub-directory.
            for d in lang.dir.iterdir():
                if d.is_dir():
                    break
            else:
                log.error('family without children: {0}'.format(lang.id))

    def log_counter(counter, name):
        # Log one aligned "key: count" row per entry, most common first,
        # followed by a row holding the total.
        msg = [name + ':']
        # ``or [0]`` keeps max() from raising ValueError for an empty counter
        # (e.g. ``by_category`` when the tree holds no language).
        maxl = max([len(k) for k in counter] or [0]) + 1
        row = '{0:<%s} {1:>8,}' % maxl
        for k, count in counter.most_common():
            msg.append(row.format(k + ':', count))
        msg.append(row.format('', sum(counter.values())))
        log.info('\n'.join(msg))

    log_counter(by_level, 'Languoids by level')
    log_counter(by_category, 'Languages by category')
    return by_level
Esempio n. 3
0
    def test_ISO(self):
        from clldutils.iso_639_3 import ISO, Code

        # Read the code tables from the zip archive among the fixtures.
        tables = ISO(FIXTURES.joinpath('iso.zip'))

        # Every value of Code._type_map, lower-cased, names a list attribute.
        for type_name in Code._type_map.values():
            codes = getattr(tables, type_name.lower())
            self.assertIsInstance(codes, list)

        # Sizes expected for the fixture data.
        self.assertEqual(len(tables.languages), 7)
        first_macro = tables.macrolanguages[0]
        self.assertEqual(len(first_macro.extension), 2)
        first_language = tables.languages[0]
        self.assertEqual(len(first_language.extension), 0)
        first_retirement = tables.retirements[0]
        self.assertEqual(len(first_retirement.change_to), 1)

        # The replacement of 'auv' is one of the languages.
        replacement = tables['auv'].change_to[0]
        self.assertIn(replacement, tables.languages)

        # Codes fetched separately must work as the same dict key.
        lookup = {tables['auv']: 1}
        self.assertIn(tables['auv'], lookup)

        # Ordering and string conversion.
        greatest = sorted(tables.values(), reverse=True)[0]
        self.assertIn('[twi]', repr(greatest))
        self.assertEqual('%s' % tables['aab'], 'Alumu-Tesu [aab]')
Esempio n. 4
0
def test_ISO(tmppath):
    """Exercise loading, string form, lookup, hashing and ordering of ``ISO``."""
    from clldutils.iso_639_3 import ISO, Code

    fixture_zip = FIXTURES.joinpath('iso.zip')

    # A copy named 20121201.zip is reported with the date 2012-12-01.
    renamed_zip = tmppath / '20121201.zip'
    copy(fixture_zip, renamed_zip)
    tables = ISO(renamed_zip)
    assert '{0}'.format(tables) == 'ISO 639-3 code tables from 2012-12-01'

    # Under its own name the fixture is reported with the date 2016-07-25.
    tables = ISO(FIXTURES.joinpath('iso.zip'))
    assert '{0}'.format(tables) == 'ISO 639-3 code tables from 2016-07-25'

    # Every value of Code._type_map, lower-cased, names a list attribute.
    for type_name in Code._type_map.values():
        codes = getattr(tables, type_name.lower())
        assert isinstance(codes, list)

    # Sizes expected for the fixture data.
    assert len(tables.languages) == 7
    first_macro = tables.macrolanguages[0]
    assert len(first_macro.extension) == 2
    first_language = tables.languages[0]
    assert len(first_language.extension) == 0
    first_retirement = tables.retirements[0]
    assert len(first_retirement.change_to) == 1

    # The replacement of 'auv' is one of the languages.
    replacement = tables['auv'].change_to[0]
    assert replacement in tables.languages

    # Codes fetched separately must work as the same dict key.
    lookup = {tables['auv']: 1}
    assert tables['auv'] in lookup

    # Ordering and string conversion.
    greatest = sorted(tables.values(), reverse=True)[0]
    assert '[twi]' in repr(greatest)
    assert '%s' % tables['aab'] == 'Alumu-Tesu [aab]'
Esempio n. 5
0
def test_ISO_download(mocker):
    """``ISO()`` without a path works with ``urlopen`` replaced by a stand-in."""
    from clldutils.iso_639_3 import ISO

    link_snippet = (
        b' href="sites/iso639-3/files/downloads/iso-639-3_Code_Tables_12345678.zip" '
    )

    def fake_urlopen(req):
        # Anything but a request for a zip gets a snippet linking to one;
        # a request for a zip gets the fixture archive, opened for reading.
        if not req.get_full_url().endswith('.zip'):
            return BytesIO(link_snippet)
        return FIXTURES.joinpath('iso.zip').open('rb')

    mocker.patch('clldutils.iso_639_3.urlopen', fake_urlopen)
    tables = ISO()
    assert 'aab' in tables
Esempio n. 6
0
def test_ISO_download(mocker):
    """``ISO()`` without a path works with ``urlopen``/``urlretrieve`` replaced."""
    from clldutils.iso_639_3 import ISO

    def fake_urlopen(*args, **kw):
        # Each call hands out a fresh object whose read() yields a snippet
        # containing a link to a code tables zip.
        reader = mocker.Mock(
            return_value=' href="iso-639-3_Code_Tables_12345678.zip" ')
        return mocker.Mock(read=reader)

    def fake_urlretrieve(url, dest):
        # "Downloading" copies the fixture archive to the destination.
        copy(FIXTURES.joinpath('iso.zip'), dest)

    replacements = dict(urlopen=fake_urlopen, urlretrieve=fake_urlretrieve)
    mocker.patch.multiple('clldutils.iso_639_3', **replacements)
    tables = ISO()
    assert 'aab' in tables
Esempio n. 7
0
def missing_iso(args):
    """Print ISO codes of type 'Individual/Living' not used in the tree.

    A code from ``iso.languages`` is printed, together with its type, when

    - its type is 'Individual/Living',
    - it is not listed in the ``change_to`` of any entry of
      ``iso.retirements``, and
    - its ``code`` is not the ``iso`` of any languoid yielded by
      ``walk_tree``.

    :param args: Object providing ``repos`` (passed on to ``languoids_path``) \
    and ``args``, whose first item, if any, is passed to ``ISO``.
    """
    tree = languoids_path('tree', repos=args.repos)
    iso = ISO(args.args[0] if args.args else None)

    # Codes that retired codes were changed to.
    replacements = {
        target for retired in iso.retirements for target in retired.change_to}

    # ISO codes assigned to languoids in the tree.
    in_tree = {lang.iso for lang in walk_tree(tree=tree) if lang.iso}

    for code in sorted(iso.languages):
        if code.type != 'Individual/Living':
            continue
        if code in replacements or code.code in in_tree:
            continue
        print(code, code.type)