def check_tree(args):
    if args.args:
        iso = ISO(args.args[0] if Path(args.args[0]).exists() else None)
    else:
        iso = None

    tree = languoids_path('tree', repos=args.repos)
    glottocodes = Glottocodes()
    log.info('checking tree at %s' % tree)
    by_level = Counter()
    by_category = Counter()
    for lang in walk_tree(tree=tree):
        by_level.update([lang.level.name])
        if lang.level == Level.language:
            by_category.update([lang.category])
        if iso and lang.iso:
            if lang.iso not in iso:
                log.warn('invalid ISO-639-3 code: %s [%s]' % (lang.id, lang.iso))
            else:
                isocode = iso[lang.iso]
                if isocode.is_retired and lang.category != 'Bookkeeping':
                    msg = '%s %s' % (lang.id, repr(isocode))
                    if len(isocode.change_to) == 1:
                        msg += ' changed to %s' % repr(isocode.change_to[0])
                    log.warn(msg)
        if not lang.id.startswith('unun9') and lang.id not in glottocodes:
            log.error('unregistered glottocode %s' % lang.id)
        for attr in ['level', 'name', 'glottocode']:
            if not getattr(lang, attr):
                log.error('missing %s: %s' % (attr, lang.id))
        if not Glottocode.pattern.match(lang.dir.name):
            log.error('invalid directory name: %s' % lang.dir.name)
        if lang.level == Level.language:
            if lang.parent and lang.parent.level != Level.family:
                log.error('invalid nesting of language under {0}: {1}'.format(
                    lang.parent.level, lang.id))
            for child in lang.children:
                if child.level != Level.dialect:
                    log.error('invalid nesting of {0} under language: {1}'.format(
                        child.level, child.id))
        elif lang.level == Level.family:
            for d in lang.dir.iterdir():
                if d.is_dir():
                    break
            else:
                log.error('family without children: {0}'.format(lang.id))

    def log_counter(counter, name):
        msg = [name + ':']
        maxl = max([len(k) for k in counter.keys()]) + 1
        for k, l in counter.most_common():
            msg.append(('{0:<%s} {1:>8,}' % maxl).format(k + ':', l))
        msg.append(('{0:<%s} {1:>8,}' % maxl).format('', sum(list(counter.values()))))
        log.info('\n'.join(msg))

    log_counter(by_level, 'Languoids by level')
    log_counter(by_category, 'Languages by category')
    return by_level

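# Usage sketch for check_tree above -- a minimal illustration, not part of the
# module. It assumes the function is driven by an argparse-style namespace with
# the `args` and `repos` attributes read in the function body; the repos path
# below is a placeholder.
from argparse import Namespace

by_level = check_tree(Namespace(args=[], repos='path/to/glottolog'))
# With args.args empty, the ISO-639-3 checks are skipped; the returned Counter
# maps level names (family/language/dialect) to languoid counts.
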
def languoid(self, id_):
    if ISO_CODE_PATTERN.match(id_):
        for l in languoids.walk_tree(tree=self.tree):
            if l.iso_code == id_:
                return l
    else:
        for d in walk(self.tree, mode='dirs'):
            if d.name == id_:
                return languoids.Languoid.from_dir(d)

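# Usage sketch for languoid() above. `catalog` is a hypothetical host object
# exposing this method and a `tree` attribute; the identifiers are examples.
# An ISO 639-3 code takes the tree-walk branch; anything else is treated as a
# glottocode and matched against directory names.
#
#     lang = catalog.languoid('deu')       # ISO code -> walk_tree lookup
#     lang = catalog.languoid('stan1295')  # glottocode -> directory lookup
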
def tree2lff(tree=TREE):
    languoids = dict(dialect=defaultdict(list), language=defaultdict(list))
    nodes = {}

    for l in walk_tree(tree=tree, nodes=nodes):
        if l.level in languoids:
            languoids[l.level][l.lff_group()].append(l.lff_language())

    for level, languages in languoids.items():
        with build_path('%sff.txt' % level[0]).open('w', encoding='utf8') as fp:
            fp.write('# -*- coding: utf-8 -*-\n')
            for path in sorted(languages):
                fp.write(path + '\n')
                for l in sorted(languages[path]):
                    fp.write(l + '\n')

def test_lang2tree(self):
    old, new = self.tmp_path('old'), self.tmp_path('new')
    old.mkdir()
    new.mkdir()

    lang2tree(
        Languoid.from_name_id_level('name', 'abcd1234', Level.language),
        [('parent', 'abcd1233', Level.family)], old, {})
    assert old.joinpath('abcd1233', 'abcd1234', 'abcd1234.ini').exists()

    lang2tree(
        Languoid.from_name_id_level('name', 'abcd1234', Level.language),
        [('parent', 'abcd1233', Level.family)], new,
        {l.id: l for l in walk_tree(old)})
    assert new.joinpath('abcd1233', 'abcd1234', 'abcd1234.ini').exists()

def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path('tree')
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            except:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError('please remove %s before proceeding' % builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')  # pragma: no cover

        lang2tree(
            lang, languages[lang.lineage[0][1]].lineage + lang.lineage, out, old_tree)

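# Usage sketch for this lff2tree variant, assuming in-memory lff/dff content is
# passed via the `lffs` mapping of Level members to file-like objects (matching
# the read_lff calls above). The lff text itself is not specified here -- the
# variables are hypothetical and only the plumbing is illustrated:
#
#     import io
#     lff2tree(lffs={
#         Level.language: io.StringIO(language_lff_text),
#         Level.dialect: io.StringIO(dialect_lff_text),
#     })
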
def tree2lff(tree=TREE, out_paths=None):
    out_paths = out_paths or {}
    languoids = {Level.dialect: defaultdict(list), Level.language: defaultdict(list)}
    nodes = {}

    for l in walk_tree(tree=tree, nodes=nodes):
        if l.level in languoids:
            languoids[l.level][l.lff_group()].append(l.lff_language())

    for level, languages in languoids.items():
        out_path = out_paths.get(level, build_path("%sff.txt" % level.name[0]))
        with out_path.open("w", encoding="utf8") as fp:
            fp.write("# -*- coding: utf-8 -*-\n")
            for path in sorted(languages):
                fp.write(path + "\n")
                for l in sorted(languages[path]):
                    fp.write(l + "\n")

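# Usage sketch for the tree2lff variant above, assuming the module context it
# lives in; the target filenames are placeholders. Without out_paths, the files
# default to build_path('lff.txt') and build_path('dff.txt'), derived from the
# first letter of the level name.
from pathlib import Path

tree2lff(out_paths={
    Level.language: Path('lff.txt'),  # language lines, grouped via lff_group()
    Level.dialect: Path('dff.txt'),   # dialect lines, grouped via lff_group()
})
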
def missing_iso(args):
    tree = languoids_path('tree', repos=args.repos)
    iso = ISO(args.args[0] if args.args else None)

    changed_to = []
    for code in iso.retirements:
        changed_to.extend(code.change_to)
    changed_to = set(changed_to)

    ingl = set()
    for lang in walk_tree(tree=tree):
        if lang.iso:
            ingl.add(lang.iso)

    for code in sorted(iso.languages):
        if (code.type == 'Individual/Living'
                and code not in changed_to
                and code.code not in ingl):
            print(code, code.type)

def lff2tree(tree=TREE, outdir=None, test=False):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree
    """
    out = Path(outdir or build_path('tree'))
    if not out.parent.exists():
        out.parent.mkdir()
    if out.exists():
        rmtree(out)
    out.mkdir()

    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}

    languages = {}
    for lang in read_lff('language'):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff('dialect'):
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')

        lang2tree(
            lang, languages[lang.lineage[0][1]].lineage + lang.lineage, out, old_tree)

    if not test:
        rmtree(TREE, ignore_errors=True)
        copytree(out, TREE)

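# Round-trip sketch for the tree2lff/lff2tree pair, assuming TREE points at a
# checkout of the languoid tree and all paths are implicit via build_path:
#
#     tree2lff()           # dump the tree to the lff.txt/dff.txt build files
#     ... edit lff.txt / dff.txt (move, rename, or add languoids) ...
#     lff2tree(test=True)  # rebuild into the build dir only, leaving TREE intact
#     lff2tree()           # or: rebuild and replace TREE with the new tree
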
def test_walk_tree(self):
    from pyglottolog.languoids import walk_tree
    self.assertEqual(len(list(walk_tree(tree=self.tree))), 4)

def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')
    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(
            models.IdsLanguage, l.lg_id,
            id=l.lg_id,
            name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id,
            name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(),
        [(iso2glotto.get(c, c), l) for c, l in languages],
        glottolog=Glottolog(GLOTTOLOG_REPOS),
        isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name lg_id what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)
    DBSession.flush()

    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i,
                type='name',
                name=l.name,
                description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(
                    norm(w[1] or '', desc.get('2'), language.id) for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2'))
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))

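# A minimal, self-contained sketch of the itertools.groupby pattern used in
# main() above (the rows and field positions are invented for illustration).
# groupby only merges *adjacent* items, so the input must be sorted by the
# grouping key first -- exactly why main() sorts read('ids') by lg_id.
from itertools import groupby

rows = [('819', 'a'), ('811', 'b'), ('819', 'c')]
for lg_id, group in groupby(sorted(rows, key=lambda r: r[0]), key=lambda r: r[0]):
    print(lg_id, [g[1] for g in group])
# 811 ['b']
# 819 ['a', 'c']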