def get_iso(d):
    """Return an ``ISO`` instance built from code tables associated with `d`.

    The directory `d` is searched for files matching
    ``iso-639-3_Code_Tables_*.zip``. If at least one exists, the one whose
    file name sorts last is handed to ``ISO``. Otherwise the result of
    ``download_tables(d)`` is used instead.

    :param d: Directory (anything accepted by ``Path``) to search.
    :return: The ``ISO`` object wrapping the selected archive.
    """
    archives = list(Path(d).glob('iso-639-3_Code_Tables_*.zip'))
    if not archives:
        return ISO(download_tables(d))  # pragma: no cover

    # Order by bare file name; presumably the name carries a date stamp so the
    # last entry is the most recent release -- confirm against download_tables.
    archives.sort(key=lambda archive: archive.name)
    return ISO(archives[-1])
def check_tree(args):
    """Walk the languoid tree, log every inconsistency found and log summaries.

    For each languoid yielded by ``walk_tree`` the following is checked:

    - if ISO tables are available and the languoid has an ISO code: that the
      code is known and - unless the languoid's category is ``'Bookkeeping'`` -
      not retired (both reported as warnings),
    - that the glottocode is registered (ids starting with ``'unun9'`` are
      exempt),
    - that ``level``, ``name`` and ``glottocode`` are set,
    - that the directory name matches ``Glottocode.pattern``,
    - that languages are only nested under families and only contain dialects,
    - that a family directory contains at least one sub-directory.

    :param args: Object providing ``args.args`` (optional list whose first \
    item is a path to ISO code tables) and ``args.repos`` (passed on to \
    ``languoids_path``).
    :return: ``Counter`` mapping level names to the number of languoids.
    """
    if args.args:
        # An existing path is used as the source of the ISO tables; otherwise
        # ISO is called with None (presumably triggering a download - confirm
        # against the ISO implementation).
        iso = ISO(args.args[0] if Path(args.args[0]).exists() else None)
    else:
        iso = None

    tree = languoids_path('tree', repos=args.repos)
    glottocodes = Glottocodes()
    log.info('checking tree at %s' % tree)
    by_level = Counter()
    by_category = Counter()

    for lang in walk_tree(tree=tree):
        by_level.update([lang.level.name])
        if lang.level == Level.language:
            by_category.update([lang.category])

        if iso and lang.iso:
            if lang.iso not in iso:
                log.warn('invalid ISO-639-3 code: %s [%s]' % (lang.id, lang.iso))
            else:
                isocode = iso[lang.iso]
                if isocode.is_retired and lang.category != 'Bookkeeping':
                    msg = '%s %s' % (lang.id, repr(isocode))
                    # Only name a replacement when it is unambiguous.
                    if len(isocode.change_to) == 1:
                        msg += ' changed to %s' % repr(isocode.change_to[0])
                    log.warn(msg)

        if not lang.id.startswith('unun9') and lang.id not in glottocodes:
            log.error('unregistered glottocode %s' % lang.id)

        for attr in ['level', 'name', 'glottocode']:
            if not getattr(lang, attr):
                log.error('missing %s: %s' % (attr, lang.id))

        if not Glottocode.pattern.match(lang.dir.name):
            log.error('invalid directory name: %s' % lang.dir.name)

        if lang.level == Level.language:
            if lang.parent and lang.parent.level != Level.family:
                log.error('invalid nesting of language under {0}: {1}'.format(
                    lang.parent.level, lang.id))
            for child in lang.children:
                if child.level != Level.dialect:
                    log.error('invalid nesting of {0} under language: {1}'.format(
                        child.level, child.id))
        elif lang.level == Level.family:
            # A family needs at least one sub-directory to have children.
            if not any(d.is_dir() for d in lang.dir.iterdir()):
                log.error('family without children: {0}'.format(lang.id))

    def log_counter(counter, name):
        """Log `counter` as an aligned table titled `name`, with a total row."""
        msg = [name + ':']
        # Fall back to [0] so an empty counter (e.g. a tree without any
        # language-level languoids) logs a bare total instead of raising
        # ValueError from max() on an empty sequence.
        maxl = max([len(k) for k in counter] or [0]) + 1
        row = '{0:<%s} {1:>8,}' % maxl
        for key, count in counter.most_common():
            msg.append(row.format(key + ':', count))
        msg.append(row.format('', sum(list(counter.values()))))
        log.info('\n'.join(msg))

    log_counter(by_level, 'Languoids by level')
    log_counter(by_category, 'Languages by category')
    return by_level
def test_ISO(self):
    """Exercise the ``ISO`` container and its ``Code`` items on ``iso.zip``."""
    from clldutils.iso_639_3 import ISO, Code

    tables = ISO(FIXTURES.joinpath('iso.zip'))

    # Every code type is exposed as a list attribute named after the
    # lower-cased type name.
    for type_name in Code._type_map.values():
        by_type = getattr(tables, type_name.lower())
        self.assertIsInstance(by_type, list)

    self.assertEqual(len(tables.languages), 7)

    first_macrolanguage = tables.macrolanguages[0]
    self.assertEqual(len(first_macrolanguage.extension), 2)
    first_language = tables.languages[0]
    self.assertEqual(len(first_language.extension), 0)
    first_retirement = tables.retirements[0]
    self.assertEqual(len(first_retirement.change_to), 1)

    self.assertIn(tables['auv'].change_to[0], tables.languages)

    # Codes must be usable as dict keys; the second lookup is deliberately
    # independent of the one used to build the mapping.
    lookup = {tables['auv']: 1}
    self.assertIn(tables['auv'], lookup)

    descending = sorted(tables.values(), reverse=True)
    self.assertIn('[twi]', repr(descending[0]))
    self.assertEqual('%s' % tables['aab'], 'Alumu-Tesu [aab]')
def test_ISO(tmppath):
    """Exercise ``ISO`` string formatting, type lists and ``Code`` behaviour."""
    from clldutils.iso_639_3 import ISO, Code

    fixture_zip = FIXTURES.joinpath('iso.zip')

    # A copy named after a date is reported with that date.
    renamed_zip = tmppath / '20121201.zip'
    copy(fixture_zip, renamed_zip)
    tables = ISO(renamed_zip)
    assert '{0}'.format(tables) == 'ISO 639-3 code tables from 2012-12-01'

    # The fixture under its own name reports a different date.
    tables = ISO(FIXTURES.joinpath('iso.zip'))
    assert '{0}'.format(tables) == 'ISO 639-3 code tables from 2016-07-25'

    # Every code type is exposed as a list attribute named after the
    # lower-cased type name.
    for type_name in Code._type_map.values():
        by_type = getattr(tables, type_name.lower())
        assert isinstance(by_type, list)

    assert len(tables.languages) == 7

    first_macrolanguage = tables.macrolanguages[0]
    assert len(first_macrolanguage.extension) == 2
    first_language = tables.languages[0]
    assert len(first_language.extension) == 0
    first_retirement = tables.retirements[0]
    assert len(first_retirement.change_to) == 1

    assert tables['auv'].change_to[0] in tables.languages

    # Codes must be usable as dict keys; the second lookup is deliberately
    # independent of the one used to build the mapping.
    lookup = {tables['auv']: 1}
    assert tables['auv'] in lookup

    descending = sorted(tables.values(), reverse=True)
    assert '[twi]' in repr(descending[0])
    assert '%s' % tables['aab'] == 'Alumu-Tesu [aab]'
def test_ISO_download(mocker):
    """Check that ``ISO()`` without a path loads tables via ``urlopen``.

    ``clldutils.iso_639_3.urlopen`` is replaced by a stub, so no network
    access happens.
    """
    from clldutils.iso_639_3 import ISO

    def fake_urlopen(req):
        """Serve a fake listing, or the fixture archive for ``.zip`` URLs."""
        url = req.get_full_url()
        if not url.endswith('.zip'):
            # Any other request gets a snippet containing a link to an archive.
            listing = (
                b' href="sites/iso639-3/files/downloads/iso-639-3_Code_Tables_12345678.zip" '
            )
            return BytesIO(listing)
        return FIXTURES.joinpath('iso.zip').open('rb')

    mocker.patch('clldutils.iso_639_3.urlopen', fake_urlopen)

    tables = ISO()
    assert 'aab' in tables
def test_ISO_download(mocker):
    """Check that ``ISO()`` without a path loads tables via the patched helpers.

    Both ``urlopen`` and ``urlretrieve`` in ``clldutils.iso_639_3`` are
    replaced by stubs, so no network access happens.
    """
    from clldutils.iso_639_3 import ISO

    def fake_urlopen(*args, **kw):
        """Return an object whose ``read()`` yields a link to an archive."""
        read = mocker.Mock(
            return_value=' href="iso-639-3_Code_Tables_12345678.zip" ')
        return mocker.Mock(read=read)

    def fake_urlretrieve(url, dest):
        """Put the fixture archive at `dest`, ignoring `url`."""
        source = FIXTURES.joinpath('iso.zip')
        copy(source, dest)

    mocker.patch.multiple(
        'clldutils.iso_639_3',
        urlopen=fake_urlopen,
        urlretrieve=fake_urlretrieve,
    )

    tables = ISO()
    assert 'aab' in tables
def missing_iso(args):
    """Print ISO codes of type ``'Individual/Living'`` missing from the tree.

    A code is printed (together with its type) when it is not listed as the
    ``change_to`` target of any retirement and its ``code`` is not the ``iso``
    attribute of any languoid yielded by ``walk_tree``.

    :param args: Object providing ``args.args`` (optional list whose first \
    item is passed to ``ISO``; ``None`` is passed when the list is empty) and \
    ``args.repos`` (passed on to ``languoids_path``).
    """
    tree = languoids_path('tree', repos=args.repos)
    iso = ISO(args.args[0] if args.args else None)

    # Codes that replace retired ones.
    replacements = {
        target for retired in iso.retirements for target in retired.change_to}

    # ISO codes already assigned to some languoid.
    assigned = {lang.iso for lang in walk_tree(tree=tree) if lang.iso}

    for code in sorted(iso.languages):
        if (code.type == 'Individual/Living'
                and code not in replacements
                and code.code not in assigned):
            print(code, code.type)